In [2]:
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

from sklearn.linear_model import SGDClassifier
import scipy.stats as stats
from sklearn.utils.fixes import loguniform


In [4]:
# Load csv into dataframe
divorce_df = pd.read_csv(os.path.join("../Data/divorce.csv"))
divorce_df.head()

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
0,2,2,4,1,0,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,3,4,4,4,4,2,2,1
2,2,2,2,2,1,3,2,1,1,2,...,3,2,3,1,1,1,2,2,2,1
3,3,2,3,2,3,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,2,2,1,1,1,1,0,0,0,0,...,2,1,2,3,2,2,2,1,0,1


## SVM with GridSearch using all features

In [9]:
# Define target, names
target = divorce_df["Class"]
target_names = ["negative", "positive"]

In [10]:
# Define feeatures, save to DataFrame
features = divorce_df.drop("Class", axis=1)
feature_names = divorce_df.columns
features.head()

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr45,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54
0,2,2,4,1,0,0,0,0,0,0,...,3,2,1,3,3,3,2,3,2,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,2,3,4,4,4,4,2,2
2,2,2,2,2,1,3,2,1,1,2,...,2,3,2,3,1,1,1,2,2,2
3,3,2,3,2,3,3,3,3,3,3,...,3,2,2,3,3,3,3,2,2,2
4,2,2,1,1,1,1,0,0,0,0,...,2,2,1,2,3,2,2,2,1,0


In [11]:
# Train, test, split data
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=36)

In [12]:
# Create instance of SVM linear classifier
model = SVC(kernel='linear')
model

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
# Create GridSearch estimator 
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=2)

In [14]:
# Fit the model using GridSearch estimator
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................................ C=10, gamma=0.001, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    0.2s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [15]:
# Make predictions with hypertuned model
predictions = grid.predict(X_test)

In [16]:
# Calculate predictions report
print(classification_report(y_test, predictions, target_names=["divorced", "married"]))

              precision    recall  f1-score   support

    divorced       0.95      1.00      0.98        21
     married       1.00      0.95      0.98        22

    accuracy                           0.98        43
   macro avg       0.98      0.98      0.98        43
weighted avg       0.98      0.98      0.98        43



## SVM with GridSearch using top 5 features

In [17]:
# Run feature importance on train and test data
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9767441860465116

In [18]:
# Print important features, highest to lowest
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.11953183746329597, 'Atr18'),
 (0.10644016515979515, 'Atr40'),
 (0.08194190713887861, 'Atr16'),
 (0.0655788456012378, 'Atr17'),
 (0.06393243336494328, 'Atr11'),
 (0.05833282008988331, 'Atr36'),
 (0.0550884488550326, 'Atr9'),
 (0.04143107804060849, 'Atr26'),
 (0.039061830015538174, 'Atr19'),
 (0.03599689002481593, 'Atr30'),
 (0.03443673982865389, 'Atr15'),
 (0.027909809271910432, 'Atr20'),
 (0.027693170667656204, 'Atr14'),
 (0.023816586634742367, 'Atr25'),
 (0.023262342833917798, 'Atr38'),
 (0.02264065443552333, 'Atr29'),
 (0.021398624669878033, 'Atr39'),
 (0.020719404724168094, 'Atr12'),
 (0.018805686218042565, 'Atr37'),
 (0.01752841303432543, 'Atr35'),
 (0.01413980478115888, 'Atr28'),
 (0.009842428200264172, 'Atr10'),
 (0.007671393900986244, 'Atr27'),
 (0.006133398799396382, 'Atr8'),
 (0.006108950239697189, 'Atr41'),
 (0.005570745716128831, 'Atr34'),
 (0.005173169024168109, 'Atr5'),
 (0.004924916872341048, 'Atr31'),
 (0.004901741108531918, 'Atr21'),
 (0.004861935255078908, 'Atr22')

In [28]:
# Re-save features DataFrame with top five factors
features2 = divorce_df[["Atr18", "Atr40", "Atr16", "Atr17", "Atr11"]]
features2

Unnamed: 0,Atr18,Atr40,Atr16,Atr17,Atr11
0,0,3,1,0,1
1,4,4,4,4,4
2,3,3,3,3,3
3,3,4,3,3,4
4,1,1,1,1,0
...,...,...,...,...,...
165,0,0,0,0,0
166,0,1,0,0,0
167,0,2,1,0,0
168,0,1,0,0,0


In [29]:
# Train, test, split data from updated DataFrame

X_train2, X_test2, y_train2, y_test2 = train_test_split(features2, target, random_state=36)

In [30]:
# Create instance of SVM linear classifier
model2 = SVC(kernel='linear')
model2

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [31]:
# Create GridSearch estimator 
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid2 = GridSearchCV(model2, param_grid, verbose=2)

In [32]:
# Fit the model using GridSearch estimator
grid2.fit(X_train2, y_train2)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s



[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    0.2s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [33]:
# Print best parameters
print(grid2.best_params_)

{'C': 1, 'gamma': 0.0001}


In [34]:
# Print best score
print(grid2.best_score_)

0.984


In [35]:
# Make predictions with hypertuned model
predictions2 = grid2.predict(X_test2)

In [36]:
# Calculate predictions report
print(classification_report(y_test2, predictions2, target_names=["divorced", "married"]))

              precision    recall  f1-score   support

    divorced       0.95      1.00      0.98        21
     married       1.00      0.95      0.98        22

    accuracy                           0.98        43
   macro avg       0.98      0.98      0.98        43
weighted avg       0.98      0.98      0.98        43



## Same process but with researcher's top attributes

In [37]:
# 7, 43, 46, 48, 52
study_features = divorce_df[["Atr7", "Atr43", "Atr46", "Atr48", "Atr52"]]
study_features

Unnamed: 0,Atr7,Atr43,Atr46,Atr48,Atr52
0,0,1,2,3,3
1,0,3,2,3,4
2,2,2,3,3,2
3,3,3,2,3,2
4,0,3,2,2,2
...,...,...,...,...,...
165,0,0,1,4,2
166,0,3,4,2,3
167,0,3,3,2,3
168,0,1,3,2,4


In [38]:
# Train, test, split data from study_features DataFrame

X_train3, X_test3, y_train3, y_test3 = train_test_split(study_features, target, random_state=36)

In [39]:
# Create instance of SVM linear classifier
model3 = SVC(kernel='linear')
model3

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Grid Search

In [40]:
# Create GridSearch estimator 

grid3 = GridSearchCV(model3, param_grid, verbose=2)

In [41]:
# Fit the model using GridSearch estimator
grid3.fit(X_train3, y_train3)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    0.2s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [42]:
# Print best parameters
print(grid3.best_params_)

{'C': 1, 'gamma': 0.0001}


In [43]:
# Print best score
print(grid3.best_score_)

0.912923076923077


In [45]:
# Make predictions with hypertuned model
predictions3 = grid3.predict(X_test3)

In [46]:
# Calculate predictions report
print(classification_report(y_test3, predictions3, target_names=["divorced", "married"]))

              precision    recall  f1-score   support

    divorced       0.95      0.95      0.95        21
     married       0.95      0.95      0.95        22

    accuracy                           0.95        43
   macro avg       0.95      0.95      0.95        43
weighted avg       0.95      0.95      0.95        43

