In [152]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import ConfusionMatrixDisplay
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

In [157]:
train_set = pd.read_csv('train_data.csv')
test_set = pd.read_csv('test_data.csv')
train_set.drop(['Unnamed: 0', 'zipcode', 'from', 'positin1', 'undergra', 'income', 'mn_sat', 'tuition', 'id', 'iid'], inplace=True, axis=1)
test_set.drop(['Unnamed: 0', 'zipcode', 'from', 'positin1', 'undergra', 'income', 'mn_sat', 'tuition', 'id', 'iid'], inplace=True, axis=1)
X_train = train_set.drop(['match'], axis=1)
columns = X_train.columns
y_train = train_set['match']
X_test = test_set.drop(['match'], axis=1)
y_test = test_set['match']

In [159]:
#Standarization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test) # NOT FIT TRANSFORM
print(np.std(X_train))
print(abs(round(np.mean(X_train), 17)))

0.9999999999999999
1e-17


In [160]:
#Before removing dec
log_reg = LogisticRegression(C = 1.0, penalty = "l2",solver = 'liblinear')

scores = cross_val_score(log_reg, X_train, y_train, cv=5, n_jobs=-1, scoring='f1')

print(scores)
print(scores.mean())
print(scores.std())

[0.98123324 0.42020666 0.98918919 1.         0.99186992]
0.8764998021737668
0.2282253215232788


In [161]:
importances = log_reg.fit(X_train, y_train).coef_
imp_df = pd.DataFrame({'Feature': columns, 'Coefficient': importances.flatten()})
imp_df.sort_values(ascending = False, by = 'Coefficient', inplace = True)
print(imp_df)

     Feature  Coefficient
19     dec_o     4.980671
85       dec     4.932351
86      attr     0.152839
20    attr_o     0.135967
140  sinc5_2     0.091390
..       ...          ...
67    amb4_1    -0.061204
90       amb    -0.061579
105   fun3_s    -0.084525
2     condtn    -0.087471
127  shar4_2    -0.088011

[183 rows x 2 columns]


In [162]:
train_set.drop(['dec', 'dec_o'], inplace=True, axis=1)
test_set.drop(['dec', 'dec_o'], inplace=True, axis=1)



In [168]:
X_train = train_set.drop(['match'], axis=1)
X_test = test_set.drop(['match'], axis=1)
columns = X_train.columns

In [164]:
#First attempt
log_reg = LogisticRegression(C = 1.0, penalty = "l2",solver = 'liblinear')

scores = cross_val_score(log_reg, X_train, y_train, cv=5, n_jobs=-1, scoring='f1')

print(scores)
print(scores.mean())
print(scores.std())

[0.52193995 0.33401222 0.42857143 0.37878788 0.2662116 ]
0.38590461704495216
0.08648231708552474


In [169]:
importances = log_reg.fit(X_train, y_train).coef_
imp_df = pd.DataFrame({'Feature': columns, 'Coefficient': importances.flatten()})
imp_df.sort_values(ascending = False, by = 'Coefficient', inplace = True)

In [171]:
imp_df

Unnamed: 0,Feature,Coefficient
9,int_corr,0.379655
90,like,0.354235
25,like_o,0.314818
146,num_in_3,0.305976
19,attr_o,0.269968
...,...,...
134,intel3_2,-0.157824
10,samerace,-0.178568
177,sinc5_3,-0.182714
103,fun3_s,-0.193990


In [173]:
#Hyperparameter tuning
param_grid = {"C":[0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 20.0, 50.0], "penalty": ["l1", "l2"], "solver": ["liblinear"]}
#param_grid = {"C":[0.1, 0.5, 1.0, 10.0, 20.0, 50.0], "penalty": ["l2"], "solver": ["liblinear"]}
log_reg_clf = LogisticRegression()


grid_search = GridSearchCV(log_reg_clf, param_grid, cv=5, n_jobs=-1, scoring='f1')
grid_search.fit(X_train, y_train)

cv_res = grid_search.cv_results_

for mean_score, params in zip(cv_res['mean_test_score'], cv_res['params']):
    print(mean_score, params)
    

0.0 {'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'}
0.3594082305926723 {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}
0.38787107204381704 {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
0.39076293516522675 {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
0.4175543246065822 {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
0.3878046854741757 {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.4072593164278307 {'C': 0.5, 'penalty': 'l1', 'solver': 'liblinear'}
0.39130076249194 {'C': 0.5, 'penalty': 'l2', 'solver': 'liblinear'}
0.40389690765330155 {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
0.38590461704495216 {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.3974601518236349 {'C': 10.0, 'penalty': 'l1', 'solver': 'liblinear'}
0.3797578514163858 {'C': 10.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.39396843665802744 {'C': 20.0, 'penalty': 'l1', 'solver': 'liblinear'}
0.37823581714140464 {'C': 20.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.39394843385769857 {

In [176]:
#Hyperparameter tuning - version with balanced classes
param_grid = {"C":[0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 20.0, 50.0], "penalty": ["l1", "l2"], "solver": ["liblinear"], "class_weight":['balanced']}
#param_grid = {"C":[0.1, 0.5, 1.0, 10.0, 20.0, 50.0], "penalty": ["l2"], "solver": ["liblinear"]}
log_reg_clf = LogisticRegression()


grid_search = GridSearchCV(log_reg_clf, param_grid, cv=5, n_jobs=-1, scoring='f1')
grid_search.fit(X_train, y_train)

cv_res = grid_search.cv_results_

for mean_score, params in zip(cv_res['mean_test_score'], cv_res['params']):
    print(mean_score, params)

0.4310632207533788 {'C': 0.001, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
0.45228360434858683 {'C': 0.001, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
0.47807750012368794 {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
0.4343943790565207 {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
0.4458570139623873 {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
0.4231803311107498 {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
0.42897216569258934 {'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
0.42330112279166954 {'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
0.42658323826704825 {'C': 1.0, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
0.4242951232940162 {'C': 1.0, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
0.4299530

In [None]:
#Only most correlated features
target = 'match'
corr_with_target = train_set.corr(method='spearman')[target]
top_corr = corr_with_target.drop(target).abs().sort_values(ascending=False).head(40)

train_set = pd.read_csv('train_data.csv')
test_set = pd.read_csv('test_data.csv')
train_set.drop(['Unnamed: 0', 'zipcode', 'from', 'positin1', 'undergra', 'income', 'mn_sat', 'tuition', 'id', 'iid'], inplace=True, axis=1)
test_set.drop(['Unnamed: 0', 'zipcode', 'from', 'positin1', 'undergra', 'income', 'mn_sat', 'tuition', 'id', 'iid'], inplace=True, axis=1)
X_train = train_set.drop(['match'], axis=1)
X_train = X_train[np.array(top_corr.index)]
columns = X_train.columns
y_train = train_set['match']
X_test = test_set.drop(['match'], axis=1)
X_test = X_test[np.array(top_corr.index)]
y_test = test_set['match']
#Standarization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test) # NOT FIT TRANSFORM
print(np.std(X_train))
print(abs(round(np.mean(X_train), 17)))



In [106]:
log_reg = LogisticRegression(...)
importances = log_reg.fit(X_train, y_train).coef_
imp_df = pd.DataFrame({'Feature': columns, 'Coefficient': importances.flatten()})
imp_df.sort_values(ascending = False, by = 'Coefficient', inplace = True)
print(imp_df)

array(['like', 'like_o', 'fun_o', 'fun', 'shar_o', 'shar', 'attr',
       'attr_o', 'prob', 'prob_o', 'intel', 'intel_o', 'sinc_o', 'sinc',
       'amb', 'amb_o', 'met_o', 'them_cal', 'date_3', 'match_es',
       'numdat_3', 'num_in_3', 'clubbing', 'fun3_2', 'satis_2', 'sinc4_1',
       'condtn', 'fun1_2', 'date', 'go_out', 'imprace', 'fun3_1',
       'shar1_1', 'you_call', 'pf_o_sha', 'int_corr', 'expnum', 'age',
       'art', 'age_o'], dtype=object)

In [None]:
log_reg = LogisticRegression(...)
log_reg.fit(X_train, y_train)
y_pred = log_reg(X_test)
score = f1_score (y_test, y_pred)
print(score)
ConfusionMatrixDisplay(log_reg, X_test, y_test)