In [1]:
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

from sklearn.linear_model import SGDClassifier
import scipy.stats as stats
from sklearn.utils.fixes import loguniform


In [2]:
# Load csv into dataframe
divorce_df = pd.read_csv(os.path.join("divorce.csv"))
divorce_df.head()

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
0,2,2,4,1,0,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,3,4,4,4,4,2,2,1
2,2,2,2,2,1,3,2,1,1,2,...,3,2,3,1,1,1,2,2,2,1
3,3,2,3,2,3,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,2,2,1,1,1,1,0,0,0,0,...,2,1,2,3,2,2,2,1,0,1


In [3]:
# Define target, names
target = divorce_df["Class"]
target_names = ["negative", "positive"]

In [4]:
# Define feeatures, save to DataFrame
features = divorce_df.drop("Class", axis=1)
feature_names = divorce_df.columns
features.head()

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr45,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54
0,2,2,4,1,0,0,0,0,0,0,...,3,2,1,3,3,3,2,3,2,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,2,3,4,4,4,4,2,2
2,2,2,2,2,1,3,2,1,1,2,...,2,3,2,3,1,1,1,2,2,2
3,3,2,3,2,3,3,3,3,3,3,...,3,2,2,3,3,3,3,2,2,2
4,2,2,1,1,1,1,0,0,0,0,...,2,2,1,2,3,2,2,2,1,0


In [5]:
# Train, test, split data
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=36)

In [7]:
# Run feature importance on train and test data
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9767441860465116

In [8]:
# Print important features, highest to lowest
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.09729877083287948, 'Atr9'),
 (0.09019939511930733, 'Atr18'),
 (0.0815636011949421, 'Atr40'),
 (0.06800564356967649, 'Atr11'),
 (0.059182961130586756, 'Atr20'),
 (0.05912276832873846, 'Atr16'),
 (0.049282514731709734, 'Atr17'),
 (0.04864237077300978, 'Atr21'),
 (0.04661316117465863, 'Atr12'),
 (0.046457565631996746, 'Atr19'),
 (0.035091278825206455, 'Atr26'),
 (0.031952007820190524, 'Atr29'),
 (0.029314199515939547, 'Atr15'),
 (0.028554604473889474, 'Atr14'),
 (0.027787232229652983, 'Atr25'),
 (0.027012041767601934, 'Atr36'),
 (0.02380105375663467, 'Atr30'),
 (0.021712195998229985, 'Atr39'),
 (0.020720536612762956, 'Atr27'),
 (0.014815167222507995, 'Atr5'),
 (0.00971486796393203, 'Atr28'),
 (0.009508664150223342, 'Atr8'),
 (0.008489289914252827, 'Atr35'),
 (0.007886627922929776, 'Atr31'),
 (0.007272976512478587, 'Atr3'),
 (0.007248552972401612, 'Atr38'),
 (0.0066365828773617755, 'Atr41'),
 (0.005191603766158699, 'Atr37'),
 (0.00503928578178992, 'Atr33'),
 (0.004325869251117314, 'Atr

In [39]:
# Re-save features DataFrame with top five factors
features = divorce_df[["Atr9", "Atr18", "Atr40", "Atr11", "Atr20"]]
features

Unnamed: 0,Atr9,Atr18,Atr40,Atr11,Atr20
0,0,0,3,1,1
1,4,4,4,4,2
2,1,3,3,3,2
3,3,3,4,4,4
4,0,1,1,0,1
...,...,...,...,...,...
165,0,0,0,0,0
166,0,0,1,0,0
167,0,0,2,0,1
168,0,0,1,0,0


In [40]:
# Train, test, split data from updated DataFrame

X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=36)

In [41]:
# Create instance of SVM linear classifier
model = SVC(kernel='linear')
model

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Grid Search

In [42]:
# Create GridSearch estimator 
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=2)

In [43]:
# Fit the model using GridSearch estimator
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................................ C=10, gamma=0.001, total=   0.0s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................................ C=10, gamma=0.001, total=   0.0s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................................ C=10, gamma=0.001, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    0.2s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [44]:
# Print best parameters
print(grid.best_params_)

{'C': 1, 'gamma': 0.0001}


In [45]:
# Print best score
print(grid.best_score_)

0.984


In [46]:
# Make predictions with hypertuned model
predictions = grid.predict(X_test)

In [47]:
# Calculate predictions report
print(classification_report(y_test, predictions, target_names=["divorced", "married"]))

              precision    recall  f1-score   support

    divorced       0.95      1.00      0.98        21
     married       1.00      0.95      0.98        22

    accuracy                           0.98        43
   macro avg       0.98      0.98      0.98        43
weighted avg       0.98      0.98      0.98        43



## Same process but with researcher's top attributes

In [54]:
# 7, 43, 46, 48, 52
study_features = divorce_df[["Atr7", "Atr43", "Atr46", "Atr48", "Atr52"]]
features

Unnamed: 0,Atr9,Atr18,Atr40,Atr11,Atr20
0,0,0,3,1,1
1,4,4,4,4,2
2,1,3,3,3,2
3,3,3,4,4,4
4,0,1,1,0,1
...,...,...,...,...,...
165,0,0,0,0,0
166,0,0,1,0,0
167,0,0,2,0,1
168,0,0,1,0,0


In [58]:
# Train, test, split data from study_features DataFrame

X_train2, X_test2, y_train2, y_test2 = train_test_split(study_features, target, random_state=36)

In [59]:
# Create instance of SVM linear classifier
model2 = SVC(kernel='linear')
model2

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Grid Search

In [64]:
# Create GridSearch estimator 
param_grid2 = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid2 = GridSearchCV(model2, param_grid2, verbose=2)

In [65]:
# Fit the model using GridSearch estimator
grid2.fit(X_train2, y_train2)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................................ C=10, gamma=0.001, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    0.2s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [71]:
# Print best parameters
print(grid2.best_params_)

{'C': 1, 'gamma': 0.0001}


In [72]:
# Print best score
print(grid2.best_score_)

0.912923076923077


In [73]:
# Make predictions with hypertuned model
predictions2 = grid2.predict(X_test2)

In [74]:
# Calculate predictions report
print(classification_report(y_test2, predictions2, target_names=["divorced", "married"]))

              precision    recall  f1-score   support

    divorced       0.95      0.95      0.95        21
     married       0.95      0.95      0.95        22

    accuracy                           0.95        43
   macro avg       0.95      0.95      0.95        43
weighted avg       0.95      0.95      0.95        43

