In [17]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import classification_report
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import neighbors
from sklearn.svm import SVC
from sklearn import svm
from sklearn import grid_search
import random

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

<b> Defining the filling empty values function </b>

In [137]:
import sys

def fillEmpty(original_df,colnum,flag):
    # assumed: df is the dataframe to operate on,
    # colnum is the column number with missing values
    # flag = 0/1 is whether it is a classification or regression problem
    
    # copying the original dataframe
    df = original_df.copy()
    
    # testing for valid flag
    if(flag != 0 and flag != 1):
        print('Invalid input flag')
        sys.exit()
    # testing for valid column number
    number_of_columns = len(list(original_df))
    if(colnum < 0 or colnum >= number_of_columns):
        print('Invalid input column number')
        sys.exit()
    # testing for the existence of empty column values
    a = df.iloc[:, [colnum]].isnull()
    idx = []
    for col in a:
        i=0
        for c in a[col]:
            if(c == True):
                idx.append(i)
            i=i+1
    if(len(idx) == 0):
        print('No empty values for input column number')
        sys.exit()
    
    # now can start pre-processing:
    # This converts all columns with "object" variables (AKA string) into numbers, and creates a dictionary  
    char_cols = df.dtypes.pipe(lambda x: x[x == 'object']).index
    label_mapping = {}
    for c in char_cols:
        df[c], label_mapping[c] = pd.factorize(df[c])
    # Accessing the rows without empty values at colnum
    df_complete = df.dropna()
    df_complete.shape
    # Accessing the rows with empty values at colnum
    df_empty = df.iloc[idx]
    df_empty.shape
    # Splitting complete rows into target/features
    features = df_complete.drop(df.columns[[colnum]], axis=1)
    target_variable = df_complete.iloc[:, [colnum]]
    # Splitting the rows with empty colnum into features and response (which is what we're predicting)
    features_empty = df_empty.drop(df.columns[[colnum]], axis=1)
    
    # now can start classifying/ predicting:
    if(flag==0): # classification
        from sklearn.ensemble import RandomForestClassifier
        print('Classifying...')
        print()
        # Training random forest
        randFor = RandomForestClassifier(n_estimators = 20)
        randFor.fit(features, target_variable)
        # Accuracy on the training set set
        print('Training score: ',randFor.score(features, target_variable))
        # Set of "City Group" predictions for the rows with empty values 
        y_pred_randFor = randFor.predict(features_empty)
        print(y_pred_randFor)
    else: # prediction
        from sklearn.linear_model import Ridge
        print('Predicting...')
        print()
        # Ridge Regression
        ridgereg = Ridge(normalize=True)
        ridgereg.fit(features,target_variable)
        y_pred_ridge = ridgereg.predict(features_empty)
        print (y_pred_ridge)

<b> Testing on Restaurant.csv, revenue </b>

In [15]:
original_df = pd.read_csv("Restaurant.csv")
original_df.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0


In [104]:
df = original_df.copy()
# List of random numbers (no repeats) between 1 and 137 and then delete A_follower_count[that row]   
rows_to_delete = random.sample(range(137), 30)
# Deleting values from those rows
for x in rows_to_delete:
    df['revenue'][x] = np.nan
# Creating array of the correct answers from original data frame  
deleted_answer_list = []
for x in rows_to_delete:
    deleted_answer_list.append(original_df['revenue'][x])
# Converting it to array from a list so we can perform certain calculations
deleted_answer_array = np.array(deleted_answer_list)
deleted_answer_array

array([  2267425.,  16549064.,   3376145.,   4067566.,   2097022.,
         5435276.,   1756069.,   3008199.,   4316715.,   1734634.,
         2383840.,   5017319.,   4264176.,   3753720.,  19696939.,
         2740687.,   5595267.,   5525735.,   2018785.,   3447890.,
         3982767.,   6694797.,   5337526.,   3600467.,   3351383.,
         2544857.,   4651866.,   1270499.,   2058644.,   4219263.])

In [138]:
# test the function
fillEmpty(df,42,1)

Predicting...

[[ 3817705.77384499]
 [ 4728598.14163705]
 [ 4052049.41928976]
 [ 4620456.33422125]
 [ 4264734.68395215]
 [ 4129408.41800465]
 [ 4465818.57630399]
 [ 4057981.35320396]
 [ 4204132.75542589]
 [ 3583284.53948341]
 [ 4614396.83699973]
 [ 4271192.23781953]
 [ 4348403.27241239]
 [ 4112762.90555375]
 [ 3914225.17760785]
 [ 4635992.16876365]
 [ 4728512.36355894]
 [ 4624400.38770224]
 [ 4885197.65996617]
 [ 4005196.49072495]
 [ 4123320.84794637]
 [ 4230834.49299511]
 [ 4956874.4874912 ]
 [ 5072973.19354393]
 [ 4113509.03807125]
 [ 5034484.13433951]
 [ 4073283.98351226]
 [ 5125491.58974022]
 [ 4233604.36083853]
 [ 4407734.15561198]]


<b> Testing on Restaurant.csv, City Group </b>

In [106]:
df2 = original_df.copy()
# List of random numbers (no repeats) between 1 and 137 and then delete A_follower_count[that row]   
rows_to_delete = random.sample(range(137), 30)
# Deleting values from those rows
for x in rows_to_delete:
    df2['City Group'][x] = np.nan
# Creating array of the correct answers from original data frame  
deleted_answer_list = []
for x in rows_to_delete:
    deleted_answer_list.append(original_df['City Group'][x])
# Converting it to array from a list so we can perform certain calculations
deleted_answer_array_city_group = np.array(deleted_answer_list)
deleted_answer_array_city_group

array(['Big Cities', 'Big Cities', 'Big Cities', 'Other', 'Big Cities',
       'Big Cities', 'Big Cities', 'Big Cities', 'Big Cities', 'Other',
       'Big Cities', 'Other', 'Big Cities', 'Other', 'Other', 'Big Cities',
       'Other', 'Big Cities', 'Other', 'Big Cities', 'Big Cities', 'Other',
       'Other', 'Other', 'Big Cities', 'Big Cities', 'Big Cities', 'Other',
       'Big Cities', 'Big Cities'], 
      dtype='<U10')

In [141]:
# test the function
fillEmpty(df2,3,0) # TODO. problem with cleaning the data when null values exist

Classifying...

Training score:  1.0
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1]
