In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

In [2]:
# Notebook-wide pandas display settings.
# Option names are spelled out in full: pandas resolves abbreviated names by
# pattern matching, and a bare "max_rows" matches more than one option on
# recent pandas versions (display.max_rows and styler.render.max_rows),
# which raises OptionError instead of setting anything.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)
# Render floats with five decimal places in frame/series output.
pd.set_option('display.float_format', '{:.5f}'.format)

In [3]:
# Load the classification training table from PostgreSQL.
#
# The connection URL (which embeds the database password) can be supplied via
# the YELP_DATABASE_URL environment variable so real credentials never have to
# be committed with the notebook. The literal below is only the fallback used
# when that variable is not set.
connect = os.environ.get(
    'YELP_DATABASE_URL',
    'postgresql+psycopg2://postgres:password@localhost:5432/yelp',
)
engine = create_engine(connect)

# NOTE(review): LIMIT without ORDER BY does not guarantee which rows come
# back, so the sample may differ between runs -- confirm whether a stable
# ordering is wanted here.
query = '''
        SELECT *
        FROM model_data_cls_train
        LIMIT 1000000
        ;
        '''
df = pd.read_sql(query, con=engine)

# Work on a copy so the frame as loaded stays available in `df`.
data = df.copy()

In [4]:
# Keep only the first row seen for each review_id, then summarise the
# remaining columns, dtypes and non-null counts.
repeated_review = data.duplicated(subset=['review_id'], keep='first')
data = data.loc[~repeated_review]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 911967 entries, 0 to 999999
Data columns (total 23 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   level_0                          911967 non-null  int64  
 1   index                            911967 non-null  int64  
 2   review_id                        911967 non-null  object 
 3   review_stars                     911967 non-null  int64  
 4   review_stars_v_user_avg          911967 non-null  float64
 5   review_stars_v_restaurant_avg    911967 non-null  float64
 6   restaurant_latitude              911967 non-null  float64
 7   restaurant_longitude             911967 non-null  float64
 8   restaurant_overall_stars         911967 non-null  float64
 9   restaurant_review_count          911967 non-null  int64  
 10  restaurant_checkin_count         911967 non-null  int64  
 11  restaurant_is_open               911967 non-null  int64  
 12  re

In [5]:
# Columns excluded from the model inputs: row/review identifiers, the
# restaurant coordinates, and the label itself.
# NOTE(review): 'level_0' and 'index' look like leftover index columns from
# an earlier export -- confirm.
unused_features = [
    'level_0',
    'index',
    'review_id',
    'restaurant_latitude',
    'restaurant_longitude',
    'TARGET_review_has_upvotes',
]

# Model inputs are every remaining column; the label is kept separately.
features = data.drop(columns=unused_features)
target = data.loc[:, 'TARGET_review_has_upvotes']

In [6]:
# Sanity-check the label vector and the feature matrix before modelling.
print(f'Target Shape: {target.shape}')
print(target.head(10))
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
features.info()

Target Shape: (911967,)
0    0
1    1
2    1
3    1
4    0
5    1
6    0
7    1
8    0
9    1
Name: TARGET_review_has_upvotes, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 911967 entries, 0 to 999999
Data columns (total 17 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   review_stars                     911967 non-null  int64  
 1   review_stars_v_user_avg          911967 non-null  float64
 2   review_stars_v_restaurant_avg    911967 non-null  float64
 3   restaurant_overall_stars         911967 non-null  float64
 4   restaurant_review_count          911967 non-null  int64  
 5   restaurant_checkin_count         911967 non-null  int64  
 6   restaurant_is_open               911967 non-null  int64  
 7   restaurant_price                 911967 non-null  int64  
 8   user_average_stars_given         911967 non-null  float64
 9   user_review_count                911967 non-null  i

In [7]:
# Standardise every feature column. The scaler is fitted on the complete
# `features` frame and then applied to that same frame.
scalar = StandardScaler()
scaled_features = scalar.fit(features).transform(features)

In [8]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
#
# The split is made on the *raw* features and the scaler is then fitted on the
# training rows only. Scaling with statistics computed from all rows would let
# information about the test rows influence preprocessing (data leakage) and
# make the test score slightly optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.20, random_state=5
)

# `scalar` is (re)bound to a scaler that has seen training rows only, so any
# later use of it on new data applies the same statistics the model saw.
scalar = StandardScaler().fit(X_train_raw)
X_train = scalar.transform(X_train_raw)
X_test = scalar.transform(X_test_raw)

In [9]:
# Fit a logistic-regression classifier on the training split (fit() returns
# the estimator itself), then score it on the held-out split.
log_reg = LogisticRegression(random_state=5).fit(X_train, y_train)

accuracy_score = log_reg.score(X_test, y_test)
print(f'Test Accuracy Score: {accuracy_score}')

Test Accuracy Score: 0.6430968123951446


In [10]:
# Pair each feature name with its fitted coefficient (first row of coef_)
# and list them from most positive to most negative.
coef = zip(features.columns, log_reg.coef_[0])
sorted_coef = sorted(coef, key=lambda pair: pair[1], reverse=True)

print('Feature Coefficients:')
for feature_name, weight in sorted_coef:
    print(f'{weight:.3f} - {feature_name}')

Feature Coefficients:
1.065 - user_fans
1.022 - user_upvotes
0.285 - user_friend_count
0.047 - restaurant_checkin_count
0.036 - restaurant_price
0.031 - user_elite_count
0.031 - restaurant_overall_stars
0.019 - user_days_active_at_review_time
-0.034 - user_average_stars_given
-0.064 - review_stars_v_user_avg
-0.073 - review_stars
-0.094 - restaurant_is_open
-0.095 - review_stars_v_restaurant_avg
-0.142 - restaurant_review_count
-0.254 - user_compliments
-0.284 - user_review_count
-0.397 - user_years_since_last_elite
