In [1]:
# (Re)load the nb_black IPython extension for this kernel session.
# NOTE(review): presumably used to auto-format each cell with black — confirm.
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Linear SVC Assignment

In [2]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

<IPython.core.display.Javascript object>

### Import the admissions data set (admissions.csv).

In [3]:
# Remote location of the admissions CSV used throughout this notebook.
ADMISSIONS_URL = (
    "https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/admissions.csv"
)

# Read the CSV straight from the URL and preview the first five rows.
data = pd.read_csv(ADMISSIONS_URL)
data.head()

Unnamed: 0,GRE,TOEFL,SchoolRank,SOP,LOR,GPA,Research,Admitted
0,337,118,4,4.5,4.5,9.65,1,1
1,324,107,4,4.0,4.5,8.87,1,1
2,316,104,3,3.0,3.5,8.0,1,1
3,322,110,3,3.5,2.5,8.67,1,1
4,314,103,2,2.0,3.0,8.21,0,0


<IPython.core.display.Javascript object>

In [15]:
# Count how many rows fall into each SchoolRank value, to see how balanced the ranks are.
data["SchoolRank"].value_counts()

3    133
2    107
4     74
5     60
1     26
Name: SchoolRank, dtype: int64

<IPython.core.display.Javascript object>

### Split the data into training and test sets, with the test set comprising 30% of the data.  Use `'Admitted'` as the target.

In [4]:
# The target is the "Admitted" column; every remaining column is used as a feature.
y = data["Admitted"]
X = data.drop("Admitted", axis=1)

# Hold out 30% of the rows for testing; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13, test_size=0.30
)

<IPython.core.display.Javascript object>

### Generate an SVC model with a linear kernel. Set the regularization parameter (C) = 10. Check the score for both train and test sets. 

In [7]:
# Linear-kernel SVC with regularization parameter C=10, fitted on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = SVC(kernel="linear", C=10).fit(X_train, y_train)
model  # display the fitted estimator as the cell output

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

<IPython.core.display.Javascript object>

In [8]:
# Mean accuracy of the fitted classifier on the training split.
model.score(X_train, y_train)

0.8785714285714286

<IPython.core.display.Javascript object>

In [9]:
# Mean accuracy of the same classifier on the held-out test split.
model.score(X_test, y_test)

0.8416666666666667

<IPython.core.display.Javascript object>

### Choose some other values for C and show the difference between the scores for the train and test sets.

In [11]:
# Candidate values for the regularization parameter C, spanning several orders of magnitude.
cs = [0.10, 1, 10, 100, 1000]

# For each C: fit a linear SVC, then print C, the train accuracy and the test
# accuracy on three consecutive lines.
for c in cs:
    model = SVC(C=c, kernel="linear").fit(X_train, y_train)
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    print(c, train_accuracy, test_accuracy, sep="\n")


0.1
0.85
0.8666666666666667
1
0.8714285714285714
0.85
10
0.8785714285714286
0.8416666666666667
100
0.8892857142857142
0.8583333333333333
1000
0.8607142857142858
0.85


<IPython.core.display.Javascript object>

### What if we switched up the target variable? Let's assume that we know whether a student was admitted. Let's try to predict what their SchoolRank was.

Create an SVC model with a linear kernel with the SchoolRank field as the target variable. Report both the train and the test scores.

In [12]:
# The target is now "SchoolRank"; every remaining column (including "Admitted") is a feature.
y = data["SchoolRank"]
X = data.drop("SchoolRank", axis=1)

# Same 70/30 split and seed as before, so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=13, test_size=0.30
)

<IPython.core.display.Javascript object>

In [13]:
# Linear-kernel SVC with the default regularization strength, fitted to predict SchoolRank.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = SVC(kernel="linear").fit(X_train, y_train)
model  # display the fitted estimator as the cell output

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

<IPython.core.display.Javascript object>

In [14]:
# Candidate values for the regularization parameter C, spanning several orders of magnitude.
cs = [0.10, 1, 10, 100, 1000]

# For each C: fit a linear SVC on the SchoolRank task, then print C, the train
# accuracy and the test accuracy on three consecutive lines.
for c in cs:
    model = SVC(C=c, kernel="linear").fit(X_train, y_train)
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    print(c, train_accuracy, test_accuracy, sep="\n")


0.1
0.5964285714285714
0.5583333333333333
1
0.6571428571428571
0.5416666666666666
10
0.65
0.575
100
0.6214285714285714
0.6083333333333333
1000
0.625
0.5916666666666667


<IPython.core.display.Javascript object>

### Show confusion matrices for the training and test sets, and a classification report for the test set. What trends do you notice?

In [17]:
# Fit the model under evaluation explicitly instead of relying on whatever
# `model` the preceding C sweep happened to leave bound (its final iteration,
# C=1000). Pinning C here keeps the confusion matrix and classification report
# tied to a known model even if the sweep's list of C values is edited,
# reordered, or the cells are run out of order.
model = SVC(C=1000, kernel="linear")
model.fit(X_train, y_train)

# Predicted SchoolRank for every row of the held-out test split.
y_pred = model.predict(X_test)

<IPython.core.display.Javascript object>

In [18]:
# Raw confusion matrix for the test split: rows are true ranks, columns are predicted ranks.
confusion_matrix(y_test, y_pred)

array([[ 2,  7,  0,  0,  0],
       [ 1, 19,  7,  0,  0],
       [ 0,  9, 31,  3,  1],
       [ 0,  0,  6, 10,  6],
       [ 0,  1,  1,  7,  9]], dtype=int64)

<IPython.core.display.Javascript object>

In [19]:
# Derive the rank labels from the data instead of hard-coding five captions, so
# the row/column names always line up with the matrix — including when a rank
# is absent from both the test labels and the predictions (which would shrink
# the matrix and make fixed five-entry captions fail).
rank_labels = np.union1d(y_test, y_pred)  # sorted unique ranks seen in either array

# Passing `labels` explicitly fixes the row/column order to match the captions.
confusion_df = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=rank_labels),
    index=[f"Actually Rank {rank}" for rank in rank_labels],
    columns=[f"Predicted Rank {rank}" for rank in rank_labels],
)

# axis=None shades every cell on one shared color scale across the whole table.
confusion_df.style.background_gradient(axis=None)

Unnamed: 0,Predicted Rank 1,Predicted Rank 2,Predicted Rank 3,Predicted Rank 4,Predicted Rank 5
Actually Rank 1,2,7,0,0,0
Actually Rank 2,1,19,7,0,0
Actually Rank 3,0,9,31,3,1
Actually Rank 4,0,0,6,10,6
Actually Rank 5,0,1,1,7,9


<IPython.core.display.Javascript object>

In [21]:
# Per-rank precision, recall, F1 and support on the test split; print() renders
# the report string with its line breaks.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.67      0.22      0.33         9
           2       0.53      0.70      0.60        27
           3       0.69      0.70      0.70        44
           4       0.50      0.45      0.48        22
           5       0.56      0.50      0.53        18

    accuracy                           0.59       120
   macro avg       0.59      0.52      0.53       120
weighted avg       0.60      0.59      0.58       120



<IPython.core.display.Javascript object>