In [93]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
import random

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [94]:
# Loading the df (the encoding thing might be an issue...let's play around with that) 
original_df = pd.read_csv("Restaurant_With_Empties.csv", encoding = "ISO-8859-1")

In [95]:
# Looking to see the number of empties in each column (there should be 137 in each column if there were no empties) 
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 43 columns):
Id            137 non-null int64
Open Date     137 non-null object
City          137 non-null object
City Group    126 non-null object
Type          137 non-null object
P1            137 non-null int64
P2            137 non-null float64
P3            126 non-null float64
P4            137 non-null float64
P5            137 non-null int64
P6            137 non-null int64
P7            137 non-null int64
P8            137 non-null int64
P9            128 non-null float64
P10           137 non-null int64
P11           137 non-null int64
P12           97 non-null float64
P13           137 non-null float64
P14           137 non-null int64
P15           137 non-null int64
P16           137 non-null int64
P17           137 non-null int64
P18           137 non-null int64
P19           137 non-null int64
P20           137 non-null int64
P21           137 non-null int64
P22           137 non-

In [96]:
# Creating a list of any column with empty values
cols_with_empties = original_df.columns[original_df.isnull().any()].tolist()
cols_with_empties

['City Group', 'P3', 'P9', 'P12', 'revenue']

In [97]:
# Things to consider - letting users select which columns they want to replace empties in (or all)

In [98]:
# Inserting temporary row for 0 (regression) or 1 (categorical)

In [99]:
original_df.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,7/17/99,ÛÁstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,2/14/08,Ankara,Big Cities,FC,4,5.0,,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,3/9/13,DiyarbakÛ±r,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,2/2/12,Tokat,,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,5/9/09,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0


In [100]:
# Finding number of columns (will be used in next portion)
number_of_columns = len(list(original_df))
number_of_columns

43

In [101]:
# Create one empty row up top of 0's that we will eventually fill in 
original_df.loc[-1] = np.zeros(number_of_columns)
original_df.index = original_df.index + 1  # shifting index
original_df = original_df.sort_index()  # sorting by index

In [102]:
# Making sure this worked
original_df.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,7/17/99,ÛÁstanbul,Big Cities,IL,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2/14/08,Ankara,Big Cities,FC,4.0,5.0,,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3/9/13,DiyarbakÛ±r,Other,IL,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,2/2/12,Tokat,,IL,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0


In [103]:
# Acually inserting values into first row, with 0=regression and 1=categorical. For now, it's based solely on data 
# type, which will need to be changed

In [104]:
# You can delete this later, but here you can see I'm checking out the data type of values from different columns 
# in the third row. O="object" and "float64" = float (example below)
original_df["City Group"].iloc[2:3].dtype

dtype('O')

In [105]:
original_df["P1"].iloc[2:3].dtype

dtype('float64')

In [106]:
# Loop to check data type of the second row of each column's data type is O (object, aka string). If so, the first 
# valuein that row is replaced with a 1, indicating that it's categorical and not a regression problem
for (counter, column) in enumerate(original_df):
    if original_df[column].iloc[2:3].dtype == 'O':
        original_df[column].iloc[0] = 1
        
# Maybe we should add other elifs like if the column name contains "date" or "zip" we can assume it's categorical? 

In [107]:
# Checking to make sure it worked, which it did
original_df.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0.0,1,1,1,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,7/17/99,ÛÁstanbul,Big Cities,IL,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2/14/08,Ankara,Big Cities,FC,4.0,5.0,,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3/9/13,DiyarbakÛ±r,Other,IL,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,2/2/12,Tokat,,IL,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0


In [108]:
# This converts all columns with "object" variables (AKA string) into numbers, and creates a dictionary  
row_one = original_df.iloc[0]
char_cols = original_df.dtypes.pipe(lambda x: x[x == 'object']).index
label_mapping = {}

for c in char_cols:
    original_df[c], label_mapping[c] = pd.factorize(original_df[c])

# This part is important because it brings the first row back to the 0/1's depending on whether it's categorical    
original_df.iloc[0] = row_one

In [109]:
# Here you can see the order it assigned the ints to string. In the City column, Istanbul=0, Ankara=1, etc. We 
# wouldn't actually print this out 
label_mapping

{'City': Index([               1,      'ÛÁstanbul',         'Ankara',    'DiyarbakÛ±r',
                 'Tokat',      'Gaziantep', 'Afyonkarahisar',         'Edirne',
               'Kocaeli',          'Bursa',         'ÛÁzmir',        'Sakarya',
              'ElazÛ±Ûô',        'Kayseri',     'Eskiôehir',    '_anlÛ±urfa',
                'Samsun',          'Adana',        'Antalya',      'Kastamonu',
                 'Uôak',         'MuÛôla',    'KÛ±rklareli',          'Konya',
              'Karabí_k',      'TekirdaÛô',        'Denizli',     'BalÛ±kesir',
                'AydÛ±n',         'Amasya',       'Kí_tahya',           'Bolu',
               'Trabzon',        'Isparta',       'Osmaniye'],
       dtype='object'),
 'City Group': Index([1, 'Big Cities', 'Other'], dtype='object'),
 'Open Date': Index([         1,  '7/17/99',  '2/14/08',   '3/9/13',   '2/2/12',   '5/9/09',
         '2/12/10', '10/11/10',  '6/21/11',  '8/28/10',
        ...
          '9/7/07', '10/14/11',   '2/8

In [110]:
# If you want to access the mapping for one specific column you can do it this way
label_mapping["City Group"]

Index([1, 'Big Cities', 'Other'], dtype='object')

In [111]:
# Checking out df to make sure string values were recoded into numbers
original_df.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0.0,1,1,1,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1,1,1,1,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2,2,1,2,4.0,5.0,,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3,3,2,1,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,4,4,-1,1,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0


In [None]:
# *Note to self - pick back up here tomorrow AM and start training the models 

In [None]:
# At the end, need to map string