In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [2]:
df = pd.read_csv('../data/mostly_cleaned.csv')

In [3]:
df.drop(columns=['essay0'
                 ,'essay1'
                 ,'essay2'
                 ,'essay3'
                 ,'essay4'
                 ,'essay5'
                 ,'essay6'
                 ,'essay7'
                 ,'essay8'
                 ,'essay9'
                 , 'Unnamed: 0'
                 , 'religion'
                 , 'sign', 'ethnicity', 'speaks'], inplace=True)

In [4]:
df.columns

Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'height', 'job', 'offspring', 'pets', 'smokes',
       'sign_actual', 'sign_seriousness', 'religion_actual',
       'religion_seriousness'],
      dtype='object')

In [5]:
simple_predictors = df.copy()
simple_predictors.drop(columns=['sign_actual','sign_seriousness'], inplace=True)
dummified = pd.get_dummies(simple_predictors)
dummified.columns

Index(['age', 'height', 'status_available', 'status_married',
       'status_seeing someone', 'status_single', 'status_unknown', 'sex_f',
       'sex_m', 'orientation_bisexual',
       ...
       'religion_actual_hinduism', 'religion_actual_islam',
       'religion_actual_judaism', 'religion_actual_no_answer',
       'religion_actual_other', 'religion_seriousness_and laughing about it',
       'religion_seriousness_and somewhat serious about it',
       'religion_seriousness_and very serious about it',
       'religion_seriousness_but not too serious about it',
       'religion_seriousness_no_answer'],
      dtype='object', length=163)

In [6]:
X = dummified
y = df.sign_actual

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((39112, 163), (9778, 163), (39112,), (9778,))

In [9]:
LR = LogisticRegression(multi_class='multinomial', max_iter=1000)
LR.fit(X_train, y_train)
LR_pred = LR.predict(X_test)
LR_results = pd.DataFrame(data={'Prediction':LR_pred, 'Actual':y_test})
LR_correct_predictions = 0
for row in LR_results.index:
    if LR_results.loc[row][0] == LR_results.loc[row][1]:
        LR_correct_predictions +=1
        
(LR_correct_predictions / len(LR_results))*100

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


8.181632235631008

In [17]:
LR.score(X_test, y_test)

0.08181632235631009

In [10]:
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
RF_pred = RF.predict(X_test)

RF_results = pd.DataFrame(data={'Prediction':RF_pred, 'Actual':y_test})
RF_correct_predictions = 0
for row in RF_results.index:
    if RF_results.loc[row][0] == RF_results.loc[row][1]:
        RF_correct_predictions +=1
        
(RF_correct_predictions / len(RF_results))*100

8.345264880343628

In [18]:
RF.score(X_test, y_test)

0.08345264880343628

In [11]:
important_features_dict = {}
for idx, val in enumerate(RF.feature_importances_):
    important_features_dict[idx] = val

important_features_list = sorted(important_features_dict,
                                 key=important_features_dict.get,
                                 reverse=True)

print(f'5 most important features: {important_features_list[:5]}')

5 most important features: [0, 1, 34, 124, 140]


In [12]:
X_train.columns[0], X_train.columns[1], X_train.columns[34], X_train.columns[124], X_train.columns[64]

('age',
 'height',
 'diet_no_answer',
 'offspring_no_answer',
 'education_graduated from college/university')

In [13]:
GB = GradientBoostingClassifier()
GB.fit(X_train, y_train)
GB_pred = GB.predict(X_test)

GB_results = pd.DataFrame(data={'Prediction':GB_pred, 'Actual':y_test})
GB_correct_predictions = 0
for row in GB_results.index:
    if GB_results.loc[row][0] == GB_results.loc[row][1]:
        GB_correct_predictions +=1
        
(GB_correct_predictions / len(RF_results))*100

8.49867048476171

In [19]:
GB.score(X_test, y_test)

0.0849867048476171

In [None]:
# model = GradientBoostingRegressor()

# grid = {'max_depth': [2,4,8],
#         'min_samples_split': [2, 4, 8],
#        'subsample': [1, 0.5, 0.25, 0.1]}

# gb_gridsearch = GridSearchCV(estimator=model, param_grid=grid, 
#                              cv=5, verbose=0, return_train_score=True)

# gb_gridsearch.fit(X_train, y_train)

# # and after some hours...
# df_gridsearch = pd.DataFrame(gb_gridsearch.cv_results_)

In [21]:
gb_gridsearch = GridSearchCV(estimator=GradientBoostingClassifier(),
                                param_grid = {'max_depth': [2, 4, 8],
                                             'min_samples_split': [2, 4, 8],
                                             'subsample': [1, 0.5, 0.25, 0.1]},
                                cv=5, verbose=0, return_train_score=True)
gb_gridsearch.fit(X_train, y_train)
df_gridsearch = pd.DataFrame(gb_gridsearch.cv_results_)

In [25]:
gb_gridsearch.best_params_

{'max_depth': 2, 'min_samples_split': 2, 'subsample': 0.5}

In [26]:
gb_gridsearch.best_score_

0.08915432805330857