In [763]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import OneHotEncoder

# Load the Yelp business data into a DataFrame.
df = pd.read_csv('International_Yelp.csv')

# Derive the binary target: success = 1 when stars >= 4, otherwise 0.
# The comparison is vectorized; a missing rating compares False and therefore
# maps to 0, exactly as the element-wise rule `1 if x >= 4 else 0` would.
df['success'] = (df['stars'] >= 4).astype(int)

# Predictors are every column except the raw rating and the derived target.
# `axis` is passed by keyword because the positional form `drop(label, 1)`
# is rejected by recent pandas releases.
X = df.drop(['stars', 'success'], axis=1)
Y = df.success



In [764]:
# Convert categorical predictors into numeric indicator (dummy) columns
def dummy_df(df, list):
    """Replace each named categorical column with one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    list : iterable of str
        Names of the columns to encode. The name shadows the built-in
        ``list`` inside this function; it is kept unchanged so that existing
        callers (including keyword callers) continue to work.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns in their original order, followed by
        the indicator columns for each encoded column (named
        ``<column>_<value>``), in the order the columns were listed.
    """
    for x in list:
        # dummy_na=False: missing values get no indicator column of their own.
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        # `axis` is given by keyword; the positional form `drop(x, 1)` is
        # rejected by recent pandas releases.
        df = df.drop(x, axis=1)
        df = pd.concat([df, dummies], axis=1)
    return df

# Names of the categorical predictors that must be one-hot encoded.
# The list is bound to `categorical_columns` rather than `list` so that the
# built-in `list` type stays usable in every later cell.
categorical_columns = [
    'Good_For_Latenight', 'Outdoor_Seating', 'Alcohol', 'Ambience_classy',
    'Parking_Lot', 'Ambience_Touristy', 'Good_For_Brunch', 'Waiter_Service',
    'Parking_Street', 'Ambience_Hipster', 'Good_For_Breakfast',
    'Parking_Garage', 'Accepts_Credit_Cards', 'Good_For_Lunch', 'valet',
    'Take_out', 'Good_For_dessert', 'Takes_Reservations', 'Ambience_Trendy',
    'Delivery', 'WiFi', 'Wheelchair_Accessible', 'Caters', 'Good_For_Dinner',
    'Good_For_Kids', 'Parking_Validated', 'Has_TV', 'Ambience_Casual',
    'Drive_Thru', 'Noise_Level', 'Smoking', 'Attire', 'Good_For_Groups',
]

# Replace each categorical column in X with its indicator columns.
X = dummy_df(X, categorical_columns)


In [765]:
# Fill missing values with the median of their column (axis=0 imputes column-wise)
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values= 'NaN', strategy = 'median' , axis = 0)
imp.fit(X)
# transform() returns a bare array, so the frame is rebuilt with X's own
# column labels and row index; keeping the index preserves the row labels
# shared with Y.
# NOTE(review): the imputer is fit on all rows, before the train/test split
# performed in a later cell — confirm that this is intended.
X = pd.DataFrame(imp.transform(X), columns=X.columns, index=X.index)


In [766]:
# Min-max normalization of every feature column
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
# fit_transform() returns a bare array, so the frame is rebuilt with X's own
# column labels and row index; keeping the index preserves the row labels
# shared with Y.
X = pd.DataFrame(min_max_scaler.fit_transform(X), columns=X.columns, index=X.index)


In [767]:
# Split into training (70%) and test partitions; the fixed random_state keeps
# the split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.7, random_state=1
)

# Report, one per line, the shapes of the raw frame, the feature matrix and
# the two partitions.
print("\n".join(str(frame.shape) for frame in (df, X, X_train, X_test)))

(26729, 37)
(26729, 74)
(18710, 74)
(8019, 74)


In [768]:
# Chi-square feature selection: keep the k=60 best-scoring predictors
import sklearn.feature_selection
from sklearn.feature_selection import chi2

select = sklearn.feature_selection.SelectKBest(score_func=chi2, k=60)
# The selector is fitted on the training partition only.
selected_features = select.fit(X_train, Y_train)
indices_selected = selected_features.get_support(indices=True)
# Translate the selected column positions back into column labels.
colnames_selected = X.columns[indices_selected].tolist()

# Restrict both partitions to the selected columns.
X_train_selected = X_train.loc[:, colnames_selected]
X_test_selected = X_test.loc[:, colnames_selected]

In [769]:
# Display the names of the columns kept by the chi-square feature selection
colnames_selected

['Price_Range',
 'review_count',
 'Good_For_Latenight_False',
 'Good_For_Latenight_True',
 'Outdoor_Seating_False',
 'Outdoor_Seating_True',
 'Alcohol_beer_and_wine',
 'Alcohol_full_bar',
 'Ambience_classy_False',
 'Ambience_classy_True',
 'Parking_Lot_True',
 'Ambience_Touristy_False',
 'Ambience_Touristy_True',
 'Good_For_Brunch_True',
 'Waiter_Service_False',
 'Waiter_Service_True',
 'Parking_Street_False',
 'Parking_Street_True',
 'Ambience_Hipster_False',
 'Ambience_Hipster_True',
 'Good_For_Breakfast_False',
 'Good_For_Breakfast_True',
 'Parking_Garage_False',
 'Parking_Garage_True',
 'Accepts_Credit_Cards_False',
 'Accepts_Credit_Cards_True',
 'Good_For_Lunch_False',
 'Good_For_Lunch_True',
 'valet_False',
 'valet_True',
 'Take_out_False',
 'Good_For_dessert_True',
 'Takes_Reservations_False',
 'Takes_Reservations_True',
 'Ambience_Trendy_False',
 'Ambience_Trendy_True',
 'Delivery_False',
 'WiFi_free',
 'WiFi_no',
 'WiFi_paid',
 'Wheelchair_Accessible_False',
 'Wheelchair_Acces

In [770]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Function to calculate AUC using roc_auc_score
def model_score(x_train, y_train, x_test, y_test):
    """Fit a logistic regression and return its ROC AUC on the test data.

    Parameters
    ----------
    x_train, y_train
        Training features and labels used to fit the model.
    x_test, y_test
        Held-out features and true labels used for scoring.

    Returns
    -------
    The value returned by ``roc_auc_score`` for the true test labels against
    the predicted probability of the positive class.
    """
    model = LogisticRegression(penalty='l2', C=100)
    # Fit on the labels passed in by the caller, not on the notebook-level
    # global `Y_train`, so the function scores whatever data it is given.
    model.fit(x_train, y_train)
    # Column 1 of predict_proba holds the probability of the second class
    # (the positive class for 0/1 labels).
    y_hat = model.predict_proba(x_test)[:, 1]
    auc = roc_auc_score(y_test, y_hat)
    return auc

# AUC of the logistic regression trained and evaluated on the selected features.
auc_processed = model_score(X_train_selected, Y_train, X_test_selected, Y_test)
# Function-call form of print: valid on Python 3 and prints the same single
# value on Python 2.
print(auc_processed)


0.69064668763


In [771]:
# Fit an L2-penalised logistic regression on the selected training features,
# then score that model on the same training data.
model = LogisticRegression(C=10, penalty='l2').fit(X_train_selected, Y_train)
model.score(X_train_selected, Y_train)

0.66750400855157666

In [772]:
# Score the already-fitted model on the held-out test partition
model.score(X=X_test_selected, y=Y_test)

0.66305025564284825

In [773]:
# Calculate the 10-fold cross-validation accuracy.
# NOTE(review): this evaluates a default LogisticRegression() on the full
# feature matrix X, not the fitted `model` on the selected columns — confirm
# that this is the intended comparison.
scores = cross_val_score(LogisticRegression(), X, Y, scoring='accuracy', cv=10)
# Function-call form of print: valid on Python 3 and prints the same single
# value on Python 2.
print(scores)
print(scores.mean())

[ 0.66529544  0.6525804   0.64098728  0.64908343  0.6187804   0.68200524
  0.62874251  0.64483533  0.64708084  0.62163174]
0.645102260654


In [16]:
# Predict the probability of success for one hypothetical restaurant.
# predict_proba expects a 2-D array of shape (n_samples, n_features), so the
# single observation is wrapped as one row instead of being passed as 1-D.
# NOTE(review): this vector holds 20 values, while the feature-selection cell
# keeps k=60 columns for `model` — confirm the vector matches the features
# (and their scaling) that the fitted model was trained on.

model.predict_proba(np.array([[2, 10, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
                               1, 1, 1, 0]]))



array([[  4.66663165e-07,   9.99999533e-01]])

In [724]:
import statsmodels.api as sm

# Logit model from statsmodels: Y is the response (endog), X the regressors (exog).
logit = sm.Logit(endog=Y, exog=X)
# Estimate the coefficients with the default fit settings.
result = logit.fit()

         Current function value: 0.591349
         Iterations: 35




In [725]:
# Print the statistical summary of the fitted Logit model.
# Function-call form of print: valid on Python 3 and prints the same single
# value on Python 2.
print(result.summary())


                           Logit Regression Results                           
Dep. Variable:                success   No. Observations:                21066
Model:                          Logit   Df Residuals:                    21001
Method:                           MLE   Df Model:                           64
Date:                Wed, 30 Nov 2016   Pseudo R-squ.:                 0.09928
Time:                        23:39:22   Log-Likelihood:                -12457.
converged:                      False   LL-Null:                       -13830.
                                        LLR p-value:                     0.000
                                  coef    std err          z      P>|z|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------------
Price_Range                    -0.3480      0.110     -3.172      0.002        -0.563    -0.133
review_count                   20.7211      1.032     20.085      0.000        1