In [31]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import pickle

In [55]:
dataset = pd.read_csv("divorce.csv", sep=";")

In [56]:
X = dataset.iloc[:, :-1].values / 4.0
Y = dataset.iloc[:, -1].values

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

In [5]:
classifier = LogisticRegression(solver='lbfgs')

In [6]:
classifier.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
y_prediction = classifier.predict(X_test)

In [8]:
y_prediction

array([1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1], dtype=int64)

In [9]:
Y_test

array([1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1], dtype=int64)

In [10]:
classifier.score(X_train, Y_train)

0.9803921568627451

In [11]:
classifier.score(X_test, Y_test)

1.0

In [12]:
print("Test set results:")
y_prediction_proba = classifier.predict_proba(X_test)
for i in range(17):
    print(f"probability: {round(y_prediction_proba[i][1], 3)}\tprediction: {y_prediction[i]}\treal: {Y_test[i]}")
print(f"Prediction score: {classifier.score(X_test, Y_test)}")

Test set results:
probability: 1.0	prediction: 1	real: 1
probability: 1.0	prediction: 1	real: 1
probability: 0.032	prediction: 0	real: 0
probability: 0.023	prediction: 0	real: 0
probability: 0.012	prediction: 0	real: 0
probability: 0.999	prediction: 1	real: 1
probability: 1.0	prediction: 1	real: 1
probability: 0.03	prediction: 0	real: 0
probability: 1.0	prediction: 1	real: 1
probability: 1.0	prediction: 1	real: 1
probability: 1.0	prediction: 1	real: 1
probability: 1.0	prediction: 1	real: 1
probability: 0.038	prediction: 0	real: 0
probability: 0.066	prediction: 0	real: 0
probability: 0.998	prediction: 1	real: 1
probability: 1.0	prediction: 1	real: 1
probability: 1.0	prediction: 1	real: 1
Prediction score: 1.0


In [13]:
classifier.coef_

array([[ 4.60532181e-01,  4.43598341e-01,  5.66420667e-01,
         2.53401578e-01,  2.40559612e-01,  5.12110716e-01,
         5.57418316e-02,  2.13998247e-01,  2.57725882e-01,
         1.29233689e-01,  3.73314414e-01,  3.00865443e-01,
        -2.32925950e-02,  3.81745422e-01,  4.80248076e-01,
         2.75300622e-01,  5.48364684e-01,  5.26818197e-01,
         4.18828342e-01,  4.85123397e-01,  1.39772391e-01,
         7.24883800e-04,  9.94104475e-04,  4.99933371e-04,
         2.80495551e-01,  6.86113314e-01,  2.25310106e-01,
         4.50567734e-01,  2.99576194e-01,  3.62402065e-01,
         5.45583228e-01,  2.89176471e-01,  2.93041452e-01,
         3.76243478e-01,  1.66265291e-01,  3.29202634e-01,
         1.50037618e-01,  3.85272742e-01,  5.93189892e-01,
         9.86373809e-01,  4.38794238e-01,  3.02890227e-01,
         2.08856273e-01,  6.44209796e-01,  7.00676846e-02,
         7.21151367e-02,  2.90102169e-03, -6.47899732e-02,
         6.42777056e-01,  2.95224195e-01, -4.79535574e-0

In [14]:
coefs = abs(classifier.coef_[0])

In [15]:
questions_to_remove = coefs.argsort()[:24]
print(questions_to_remove)

[23 21 22 46 50 12 53  6 47 44 45  9 20 36 34 42  7 26  4  3  8 15 24 31]


In [16]:
dataset2 = dataset.copy()
for question in questions_to_remove:
    dataset2 = dataset2.drop(f"Atr{question + 1}", axis=1)

In [17]:
X2 = dataset2.iloc[:, :-1].values / 4.0
Y2 = dataset2.iloc[:, -1].values

In [18]:
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y2, test_size=0.1)

In [33]:
classifier2 = LogisticRegression(solver='lbfgs')
classifier2.fit(X2_train, Y2_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
y2_prediction = classifier2.predict(X2_test)

In [35]:
classifier2.score(X2_train, Y2_train)

0.9869281045751634

In [36]:
classifier2.score(X2_test, Y2_test)

1.0

In [37]:
print("Test set results:")
y2_prediction_proba = classifier2.predict_proba(X2_test)
for i in range(17):
    print(f"probability: {round(y2_prediction_proba[i][1], 3)}\tprediction: {y2_prediction[i]}\treal: {Y2_test[i]}")
print(f"Prediction score: {classifier2.score(X2_test, Y2_test)}")

Test set results:
probability: 1.0	prediction: 1	real: 1
probability: 0.023	prediction: 0	real: 0
probability: 0.016	prediction: 0	real: 0
probability: 0.037	prediction: 0	real: 0
probability: 0.999	prediction: 1	real: 1
probability: 0.038	prediction: 0	real: 0
probability: 1.0	prediction: 1	real: 1
probability: 0.034	prediction: 0	real: 0
probability: 0.999	prediction: 1	real: 1
probability: 0.04	prediction: 0	real: 0
probability: 0.997	prediction: 1	real: 1
probability: 0.999	prediction: 1	real: 1
probability: 0.999	prediction: 1	real: 1
probability: 0.034	prediction: 0	real: 0
probability: 0.999	prediction: 1	real: 1
probability: 0.053	prediction: 0	real: 0
probability: 0.046	prediction: 0	real: 0
Prediction score: 1.0


In [24]:
questions_to_remove = coefs.argsort()[:44]
print(questions_to_remove)

[23 21 22 46 50 12 53  6 47 44 45  9 20 36 34 42  7 26  4  3  8 15 24 31
 32 49 28 11 41 35 29 10 33 13 37 52 18 40  1 51 27  0 14 19]


In [25]:
dataset3 = dataset.copy()
for question in questions_to_remove:
    dataset3 = dataset3.drop(f"Atr{question + 1}", axis=1)

In [26]:
X3 = dataset3.iloc[:, :-1].values / 4.0
Y3 = dataset3.iloc[:, -1].values
X3_train, X3_test, Y3_train, Y3_test = train_test_split(X3, Y3, test_size=0.1)
classifier3 = LogisticRegression(solver='lbfgs')
classifier3.fit(X3_train, Y3_train)
y3_prediction = classifier3.predict(X3_test)

In [27]:
classifier3.score(X3_train, Y3_train)

0.9869281045751634

In [28]:
print("Test set results:")
y3_prediction_proba = classifier3.predict_proba(X3_test)
for i in range(17):
    print(f"probability: {round(y3_prediction_proba[i][1], 3)}\tprediction: {y3_prediction[i]}\treal: {Y3_test[i]}")
print(f"Prediction score: {classifier3.score(X3_test, Y3_test)}")

Test set results:
probability: 0.01	prediction: 0	real: 0
probability: 0.986	prediction: 1	real: 1
probability: 0.014	prediction: 0	real: 0
probability: 0.016	prediction: 0	real: 0
probability: 0.034	prediction: 0	real: 0
probability: 0.38	prediction: 0	real: 1
probability: 0.033	prediction: 0	real: 0
probability: 0.038	prediction: 0	real: 0
probability: 0.038	prediction: 0	real: 0
probability: 0.991	prediction: 1	real: 1
probability: 0.987	prediction: 1	real: 1
probability: 0.99	prediction: 1	real: 1
probability: 0.05	prediction: 0	real: 0
probability: 0.063	prediction: 0	real: 0
probability: 0.967	prediction: 1	real: 1
probability: 0.984	prediction: 1	real: 1
probability: 0.972	prediction: 1	real: 1
Prediction score: 0.9411764705882353


In [29]:
dataset3

Unnamed: 0,Atr3,Atr6,Atr17,Atr18,Atr26,Atr31,Atr39,Atr40,Atr44,Atr49,Class
154,1,1,0,1,0,1,1,1,1,1,0
130,0,0,0,0,1,0,1,0,2,2,0
65,3,1,3,3,3,4,4,3,3,4,1
95,0,0,0,0,0,0,0,0,0,2,0
161,2,1,2,1,0,0,0,0,0,1,0
62,3,2,3,2,2,4,4,4,4,4,1
60,3,2,3,2,2,4,4,4,4,4,1
101,1,1,0,0,0,0,0,0,0,0,0
162,0,2,0,0,0,0,0,0,2,2,0
67,2,2,4,2,2,4,4,3,3,4,1


In [38]:
pickle.dump(classifier, open('divorceModelComplex.pkl','wb'))
pickle.dump(classifier2, open('divorceModelMiddle.pkl','wb'))
pickle.dump(classifier3, open('divorceModelSimplyfied.pkl','wb'))

array([[1., 2., 3.]])

AttributeError: 'SelectKBest' object has no attribute 'pvalues_'