# Module 8: Linear Models

### Introducing Linear Regression

In [17]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [2]:
# Synthetic feature: 50 evenly spaced lengths in [0, 2] with a little Gaussian jitter.
np.random.seed(0)  # seed NumPy's global RNG so the generated data is reproducible
n = 50
length_jitter = np.random.randn(n) * 0.01
X_1 = np.linspace(0, 2, n) + length_jitter
X = pd.DataFrame({'length': X_1})
X.head()

Unnamed: 0,length
0,0.017641
1,0.044818
2,0.09142
3,0.144858
4,0.181941


In [3]:
# Synthetic target: five times the length plus non-negative noise
# (absolute value of standard-normal draws, scaled by 2).
weight_noise = np.abs(np.random.randn(n, 1)) * 2
y = pd.DataFrame(weight_noise + 5 * X_1.reshape(-1, 1), columns=['weight'])
y.head()

Unnamed: 0,weight
0,1.879136
1,0.997894
2,1.47871
3,3.085554
4,0.966069


In [8]:
# Hold out 20% of the synthetic rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [9]:
# Ridge regression with its default regularisation strength (alpha).
rm = Ridge()
rm.fit(X_train, y_train);  # trailing ';' suppresses the estimator repr in the cell output

In [10]:
# Predicted weights for the first five training rows.
rm.predict(X_train)[:5]

array([[ 5.68777859],
       [11.17960813],
       [ 3.61938463],
       [ 9.80015212],
       [ 2.22783103]])

In [11]:
# For scikit-learn regressors .score() reports R^2; here it is evaluated on the training split.
rm.score(X_train, y_train)

0.9131303171985857

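**alpha**: the regularization strength. Bigger values shrink the coefficients more, which makes the model less complex (and, if pushed too far, causes it to underfit).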

In [12]:
# Same model with a very large penalty, to show what heavy regularisation does to the fit.
# .fit() returns the estimator itself, so construction and fitting can be chained.
rm2 = Ridge(alpha=10_000).fit(X_train, y_train)

In [13]:
# Training R^2 of the heavily regularised model, for comparison with `rm` above.
rm2.score(X_train, y_train)

0.0024734002721027437

In [18]:
# Sweep alpha over powers of ten (1e-2 ... 1e5), recording the mean training
# score and mean cross-validation score for each value.
alphas = 10.0 ** np.arange(-2, 6, 1)
train_scores, cv_scores = [], []

for alpha in alphas:
    ridge_model = Ridge(alpha=alpha)
    results = cross_validate(ridge_model, X_train, y_train, return_train_score=True)
    train_scores.append(results["train_score"].mean())
    cv_scores.append(results["test_score"].mean())  # "test_score" is the validation-fold score

scores_dict = {
    "alpha": alphas,
    "train_scores": train_scores,
    "cv_scores": cv_scores,
}

In [19]:
# One row per alpha: mean train score next to mean cross-validation score.
pd.DataFrame(scores_dict)

Unnamed: 0,alpha,train_scores,cv_scores
0,0.01,0.917359,0.892893
1,0.1,0.917281,0.893221
2,1.0,0.910626,0.889479
3,10.0,0.70356,0.674002
4,100.0,0.169554,0.087447
5,1000.0,0.01944,-0.079094
6,10000.0,0.001972,-0.098498
7,100000.0,0.000198,-0.10047


### Coefficients and coef_

In [21]:
# Load the housing data (path is relative to the notebook's working directory)
# and keep 10% of the rows aside as a test set with a fixed shuffle.
housing_df = pd.read_csv("data/housing.csv")
train_df, test_df = train_test_split(housing_df, test_size=0.1, random_state=1)
train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
5425,-118.42,34.02,34.0,2243.0,444.0,973.0,413.0,4.9676,414100.0,<1H OCEAN
12499,-121.46,38.57,52.0,810.0,172.0,326.0,151.0,3.1583,140000.0,INLAND
7020,-118.1,33.97,35.0,2426.0,529.0,2010.0,514.0,2.9922,163500.0,<1H OCEAN
6477,-118.08,34.09,32.0,3214.0,718.0,2316.0,751.0,3.7066,206800.0,<1H OCEAN
8670,-118.37,33.82,32.0,2815.0,607.0,1338.0,609.0,4.5687,381200.0,<1H OCEAN


In [26]:
# Target is the median house value. The string-valued `ocean_proximity` column and
# `total_bedrooms` are left out of the features
# (NOTE(review): presumably because total_bedrooms has missing values -- confirm).
target_column = "median_house_value"
excluded_columns = [target_column, "ocean_proximity", "total_bedrooms"]

X_train = train_df.drop(columns=excluded_columns)
y_train = train_df[target_column]
X_test = test_df.drop(columns=excluded_columns)
y_test = test_df[target_column]

In [27]:
# Check column dtypes and non-null counts of the selected features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18576 entries, 5425 to 235
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           18576 non-null  float64
 1   latitude            18576 non-null  float64
 2   housing_median_age  18576 non-null  float64
 3   total_rooms         18576 non-null  float64
 4   population          18576 non-null  float64
 5   households          18576 non-null  float64
 6   median_income       18576 non-null  float64
dtypes: float64(7)
memory usage: 1.1 MB


In [29]:
# Fit a default Ridge model on the housing features and report its training score.
lm = Ridge().fit(X_train, y_train)
training_score = lm.score(X_train, y_train)
training_score

0.6319807221851641

In [30]:
# Learned weights, one per feature, in the same order as X_train's columns.
lm.coef_

array([-4.22348876e+04, -4.24083058e+04,  1.14085214e+03, -1.69515643e+00,
       -4.22416401e+01,  1.46421396e+02,  3.84011720e+04])

In [31]:
# Keep a named reference to the coefficient array; later cells reuse it.
ridge_coeffs = lm.coef_
ridge_coeffs

array([-4.22348876e+04, -4.24083058e+04,  1.14085214e+03, -1.69515643e+00,
       -4.22416401e+01,  1.46421396e+02,  3.84011720e+04])

In [32]:
# Label each coefficient with the feature it belongs to.
words_coeffs_df = pd.DataFrame({'Coefficients': ridge_coeffs}, index=X_train.columns)
words_coeffs_df

Unnamed: 0,Coefficients
longitude,-42234.887639
latitude,-42408.305835
housing_median_age,1140.85214
total_rooms,-1.695156
population,-42.24164
households,146.421396
median_income,38401.17203


In [33]:
# Rank features by coefficient magnitude (sign dropped), smallest first.
# Note the features are unscaled, so magnitudes reflect each feature's units as well.
words_coeffs_df.abs().sort_values(by='Coefficients')

Unnamed: 0,Coefficients
total_rooms,1.695156
population,42.24164
households,146.421396
housing_median_age,1140.85214
median_income,38401.17203
longitude,42234.887639
latitude,42408.305835


**Interpreting learned coefficients**

In linear models:
* if the coefficient is positive, then increasing the feature values increases the prediction value
* if the coefficient is negative, then increasing the feature values decreases the prediction value
* if the coefficient is 0, the feature is not used in making a prediction

**Predicting**

In [34]:
# The 0:1 slice keeps the first training example as a one-row DataFrame rather than a Series.
X_train.iloc[0:1]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income
5425,-118.42,34.02,34.0,2243.0,973.0,413.0,4.9676


In [36]:
# Model prediction for that single example.
lm.predict(X_train.iloc[0:1])

array([281969.52599679])

In [37]:
# Transpose so the coefficients line up column-by-column with the feature row shown next.
words_coeffs_df.T

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income
Coefficients,-42234.887639,-42408.305835,1140.85214,-1.695156,-42.24164,146.421396,38401.17203


In [38]:
# Feature values of the first example, shown again for comparison with the coefficients.
X_train.iloc[0:1]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income
5425,-118.42,34.02,34.0,2243.0,973.0,413.0,4.9676


In [41]:
# Reproduce lm.predict by hand: multiply each feature by its coefficient,
# sum across the row, then add the intercept.
intercept = lm.intercept_
first_example = X_train.iloc[0:1]
first_example.mul(ridge_coeffs).sum(axis=1) + intercept

5425    281969.525997
dtype: float64

### Logistic Regression

In [68]:
# Load the Canada/USA cities data (longitude, latitude, country) and hold out 20% for testing.
cities_df = pd.read_csv("https://raw.githubusercontent.com/UBC-MDS/DSCI_571_sup-learn-1/master/lectures/data/canada_usa_cities.csv")
train_df, test_df = train_test_split(cities_df, test_size=0.2, random_state=123)

# `columns=` already says which axis to drop from, so no separate `axis` argument is passed.
X_train, y_train = train_df.drop(columns=["country"]), train_df["country"]
X_test, y_test = test_df.drop(columns=["country"]), test_df["country"]

train_df.head()

Unnamed: 0,longitude,latitude,country
160,-76.4813,44.2307,Canada
127,-81.2496,42.9837,Canada
169,-66.058,45.2788,Canada
188,-73.2533,45.3057,Canada
187,-67.9245,47.1652,Canada


In [70]:
from sklearn.dummy import DummyClassifier

# Baseline to compare against: strategy="prior" predicts from the class distribution
# alone and ignores the features.
dc = DummyClassifier(strategy="prior")

scores = pd.DataFrame(cross_validate(dc, X_train, y_train, return_train_score=True))
# Show the baseline now: `scores` is reassigned by the logistic-regression cell that follows.
scores

In [71]:
from sklearn.linear_model import LogisticRegression

In [74]:
# Cross-validate a default logistic regression and tabulate the per-fold results.
lr = LogisticRegression()
cv_results = cross_validate(lr, X_train, y_train, return_train_score=True)
scores = pd.DataFrame(cv_results)
scores

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.087232,0.008018,0.852941,0.827068
1,0.04697,0.002546,0.823529,0.827068
2,0.010757,0.002385,0.69697,0.858209
3,0.008517,0.002186,0.787879,0.843284
4,0.010337,0.004234,0.939394,0.80597


**Coefficients**

In [75]:
# Fit on the whole training split so the learned coefficients can be inspected.
lr = LogisticRegression().fit(X_train, y_train)

In [76]:
# coef_ holds one weight per feature (longitude, latitude); intercept_ is the bias term.
print("Model coefficients:", lr.coef_)
print("Model intercept:", lr.intercept_)

Model coefficients: [[-0.04108149 -0.33683126]]
Model intercept: [10.8869838]


**Predictions**

In [77]:
# Class labels in the order the model uses them (e.g. for predict_proba columns).
lr.classes_

array(['Canada', 'USA'], dtype=object)

In [79]:
# First test city as a Series of [longitude, latitude].
example = X_test.iloc[0]
example.tolist()

[-64.8001, 46.098]

In [80]:
# Linear score computed by hand: sum of coefficient * feature value, plus the intercept.
# NOTE(review): for a binary model this should match lr.decision_function; a negative
# value corresponds to the first entry of lr.classes_ -- confirm against the prediction below.
(example.tolist() * lr.coef_).sum(axis=1) + lr.intercept_

array([-1.97817876])

In [81]:
# Predict from a one-row DataFrame so the column names seen at fit time are preserved.
# Wrapping the Series in a list turns it into a bare array without feature names,
# which newer scikit-learn versions warn about.
lr.predict(example.to_frame().T)



array(['Canada'], dtype=object)

**Hyperparameter: C**
 `C` is the inverse of the regularization strength: larger values mean weaker regularization and therefore increase the model’s complexity.

In [83]:
# Sweep C over powers of ten (1e-6 ... 1e1), recording the mean training
# score and mean cross-validation score for each value.
C_values = 10.0 ** np.arange(-6, 2, 1)
train_score, cv_score = [], []

for C in C_values:
    lr_model = LogisticRegression(C=C)
    results = cross_validate(lr_model, X_train, y_train, return_train_score=True)
    train_score.append(results["train_score"].mean())
    cv_score.append(results["test_score"].mean())  # "test_score" is the validation-fold score

scores_dict = {
    "C": C_values,
    "train_score": train_score,
    "cv_score": cv_score,
}

In [84]:
# One row per C: mean train score next to mean cross-validation score.
pd.DataFrame(scores_dict)

Unnamed: 0,C,train_score,cv_score
0,1e-06,0.59881,0.59893
1,1e-05,0.59881,0.59893
2,0.0001,0.664707,0.658645
3,0.001,0.784424,0.790731
4,0.01,0.827842,0.826203
5,0.1,0.83232,0.820143
6,1.0,0.83232,0.820143
7,10.0,0.83232,0.820143


In [86]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded unless some other library happens to have imported it already.
import scipy.stats
from sklearn.model_selection import RandomizedSearchCV

# Despite the `param_grid` / `grid_search` names this is a *randomized* search:
# candidate values of C are sampled from a uniform distribution on [0, 100].
# NOTE(review): no random_state is passed, so the sampled candidates depend on the
# state of NumPy's global RNG at the time this cell runs.
param_grid = {
    "C": scipy.stats.uniform(0, 100)}

lr = LogisticRegression()
grid_search = RandomizedSearchCV(lr, param_grid, cv=5, return_train_score=True, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [87]:
# Sampled value of C with the best mean cross-validation score.
grid_search.best_params_

{'C': 42.38550485581797}

In [89]:
# Mean cross-validation score achieved by that best candidate.
grid_search.best_score_

0.8201426024955436

**Logistic regression with text data**

In [90]:
# Toy SMS corpus: each message is paired with its label, then split into
# parallel feature (X) and target (y) lists.
labelled_messages = [
    ("URGENT! As a valued network customer you have been selected to receive a prize!", "spam"),
    ("Lol you are always so convincing.", "non spam"),
    ("Nah I don't think he goes to usf", "non spam"),
    ("URGENT! You have won a 1 week FREE membership Jackpot!", "spam"),
    ("Had your mobile 11 months or more? U R entitles to Update to latest color mobiles for Free!", "spam"),
    ("As per your request, 'Melle Melle' has been set as your callertune for all Callers", "non spam"),
]

X = [message for message, _ in labelled_messages]
y = [label for _, label in labelled_messages]

In [91]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding: one row per message, one column per vocabulary term,
# each cell holding how often the term occurs in the message.
vec = CountVectorizer()
X_transformed = vec.fit_transform(X)
vocabulary_columns = sorted(vec.vocabulary_)
bow_df = pd.DataFrame(X_transformed.toarray(), index=X, columns=vocabulary_columns)
bow_df

Unnamed: 0,11,all,always,are,as,been,callers,callertune,color,convincing,...,think,to,update,urgent,usf,valued,week,won,you,your
URGENT! As a valued network customer you have been selected to receive a prize!,0,0,0,0,1,1,0,0,0,0,...,0,1,0,1,0,1,0,0,1,0
Lol you are always so convincing.,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
Nah I don't think he goes to usf,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
URGENT! You have won a 1 week FREE membership Jackpot!,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,1,1,0
Had your mobile 11 months or more? U R entitles to Update to latest color mobiles for Free!,1,0,0,0,0,0,0,0,1,0,...,0,2,1,0,0,0,0,0,0,1
"As per your request, 'Melle Melle' has been set as your callertune for all Callers",0,1,0,0,2,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,2


In [92]:
# Fit logistic regression directly on the bag-of-words count matrix.
lr_text_model = LogisticRegression()
lr_text_model.fit(X_transformed, y);

```
# Pair every vocabulary term with its learned coefficient. The terms come from the
# fitted vocabulary in sorted order, the same way the bag-of-words frame's columns are
# built; `CountVectorizer.get_feature_names()` was removed in scikit-learn 1.2.
pd.DataFrame({'feature': sorted(vec.vocabulary_),
              'coefficient': lr_text_model.coef_[0]})
```

### Predicting Probabilities

In [95]:
# X_train still refers to the cities features (longitude, latitude) at this point.
X_train.head()

Unnamed: 0,longitude,latitude
160,-76.4813,44.2307
127,-81.2496,42.9837
169,-66.058,45.2788
188,-73.2533,45.3057
187,-67.9245,47.1652


In [96]:
# Refit a default logistic regression on the cities data; `lr` was last bound to the
# unfitted estimator handed to the randomized search.
lr = LogisticRegression()
lr.fit(X_train, y_train);

In [97]:
# Hard class prediction for the first test city (the [:1] slice keeps a one-row DataFrame).
lr.predict(X_test[:1])

array(['Canada'], dtype=object)

In [98]:
# Class probabilities for the same city; columns follow the order of lr.classes_.
lr.predict_proba(X_test[:1])

array([[0.87848688, 0.12151312]])

In [99]:
# Hard predictions for every training city; show the last five.
predict_y = lr.predict(X_train)
predict_y[-5:]

array(['Canada', 'Canada', 'USA', 'Canada', 'Canada'], dtype=object)

In [100]:
# Matching class probabilities (one row per training city, one column per class); last five rows.
y_proba = lr.predict_proba(X_train)
y_proba[-5:]

array([[0.69848481, 0.30151519],
       [0.76970638, 0.23029362],
       [0.05301712, 0.94698288],
       [0.63294488, 0.36705512],
       [0.81540165, 0.18459835]])

In [101]:
# Put true labels, predictions and probability pairs side by side.
# y_train is a Series, so the resulting frame takes its index; the plain lists are
# matched to it by position.
data_dict = {"y":y_train,
             "pred y": predict_y.tolist(),
             "probabilities": y_proba.tolist()}
pd.DataFrame(data_dict).tail(10)

Unnamed: 0,y,pred y,probabilities
96,Canada,Canada,"[0.7047596510140415, 0.2952403489859585]"
57,USA,USA,"[0.03121394423109436, 0.9687860557689056]"
123,Canada,Canada,"[0.6537036743991862, 0.3462963256008138]"
106,Canada,Canada,"[0.8444267867198362, 0.1555732132801638]"
83,Canada,Canada,"[0.6537036743991862, 0.3462963256008138]"
17,USA,Canada,"[0.6984848138411378, 0.3015151861588622]"
98,Canada,Canada,"[0.7697063812753013, 0.23029361872469864]"
66,USA,USA,"[0.05301711626872618, 0.9469828837312738]"
126,Canada,Canada,"[0.6329448842395049, 0.36705511576049504]"
109,Canada,Canada,"[0.8154016516676704, 0.18459834833232955]"


In [103]:
# Column 0 of y_proba is the probability of 'Canada', because lr.classes_ lists 'Canada' first.
lr_targets = pd.DataFrame({"y":y_train,
                           "pred y": predict_y.tolist(),
                           "probability_canada": y_proba[:,0].tolist()})
lr_targets.head(3)

Unnamed: 0,y,pred y,probability_canada
160,Canada,Canada,0.704607
127,Canada,Canada,0.563017
169,Canada,Canada,0.838968


In [104]:
# Order cities from least to most "Canada-like" according to the model.
lr_targets.sort_values(by='probability_canada')

Unnamed: 0,y,pred y,probability_canada
37,USA,USA,0.006547
78,USA,USA,0.007685
34,USA,USA,0.008317
41,USA,USA,0.008958
38,USA,USA,0.009194
...,...,...,...
149,Canada,Canada,0.924004
81,Canada,Canada,0.931792
0,USA,Canada,0.932487
165,Canada,Canada,0.951092


In [105]:
# Look up the coordinates of two training cities by index label.
X_train.loc[[1,37]]

Unnamed: 0,longitude,latitude
1,-134.4197,58.3019
37,-98.4951,29.4246


In [107]:
# Absolute gap between the two class probabilities: a small gap means the model
# is close to undecided for that city.
prob_gap = np.abs(y_proba[:, 0] - y_proba[:, 1])
lr_targets = pd.DataFrame({
    "y": y_train,
    "pred y": predict_y.tolist(),
    "prob_difference": prob_gap.tolist(),
})
lr_targets.sort_values(by="prob_difference").head()

Unnamed: 0,y,pred y,prob_difference
61,USA,USA,0.001719
54,USA,USA,0.020025
13,USA,USA,0.020025
130,Canada,USA,0.022234
92,Canada,USA,0.022234


In [108]:
# Coordinates of the two cities with the smallest probability gap.
X_train.loc[[61,54]]

Unnamed: 0,longitude,latitude
61,-87.9225,43.035
54,-83.0466,42.3316


### Multi-class Classification

More than 2 classes

In [115]:
# Wine data: `Class` is the target, every other column is a feature.
# No test_size is given, so train_test_split uses its default split proportion.
data = pd.read_csv("data/wine.csv")
y = data['Class']
X = data.drop(columns=['Class'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

In [116]:
# First few rows of the wine features.
X_train.head()

Unnamed: 0,Alcohol,MalicAcid,Ash,Alcalinity,Magnesium,Phenols,Flavanoids,Nonflavanoid,Proanthocyanins,ColorIntensity,Hue,OD280/OD315,Proline
36,13.28,1.64,2.84,15.5,110,2.6,2.68,0.34,1.36,4.6,1.09,2.78,880
77,11.84,2.89,2.23,18.0,112,1.72,1.32,0.43,0.95,2.65,0.96,2.52,500
131,12.88,2.99,2.4,20.0,104,1.3,1.22,0.24,0.83,5.4,0.74,1.42,530
159,13.48,1.67,2.64,22.5,89,2.6,1.1,0.52,2.29,11.75,0.57,1.78,620
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [117]:
# Matching class labels for those rows.
y_train[:5]

  y_train[:5]


36     1
77     2
131    3
159    3
4      1
Name: Class, dtype: int64

In [118]:
# Raise the solver's iteration cap.
# NOTE(review): presumably needed for convergence on these unscaled features -- confirm.
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train, y_train);

In [119]:
# Class labels the model has learned.
lr.classes_

array([1, 2, 3])

In [120]:
# Predicted class for the first five test wines.
lr.predict(X_test[:5])

array([1, 2, 1, 3, 2])

In [121]:
# With several classes, coef_ has one row of feature weights per class.
lr.coef_

array([[ 0.53321486,  0.43407894,  0.38247398, -0.0471342 , -0.0256897 ,
         0.54161293,  0.87376141, -0.00726817, -0.09397824,  0.21687767,
         0.02966878,  0.61730404,  0.01101912],
       [-0.70179586, -0.80272252, -0.45933962, -0.09299408, -0.03494651,
        -0.06706067,  0.26562443,  0.15061604,  0.78244563, -0.9361069 ,
         0.25611067, -0.02816946, -0.00915907],
       [ 0.168581  ,  0.36864357,  0.07686564,  0.14012829,  0.06063621,
        -0.47455227, -1.13938583, -0.14334786, -0.68846738,  0.71922923,
        -0.28577945, -0.58913457, -0.00186005]])

In [122]:
# (number of classes, number of features)
lr.coef_.shape

(3, 13)

In [123]:
# Label the coefficient matrix (classes x features), then transpose so that
# features are the rows and classes are the columns.
coef_by_class = pd.DataFrame(lr.coef_, index=lr.classes_, columns=X_train.columns)
lr_coefs = coef_by_class.T
lr_coefs

Unnamed: 0,1,2,3
Alcohol,0.533215,-0.701796,0.168581
MalicAcid,0.434079,-0.802723,0.368644
Ash,0.382474,-0.45934,0.076866
Alcalinity,-0.047134,-0.092994,0.140128
Magnesium,-0.02569,-0.034947,0.060636
Phenols,0.541613,-0.067061,-0.474552
Flavanoids,0.873761,0.265624,-1.139386
Nonflavanoid,-0.007268,0.150616,-0.143348
Proanthocyanins,-0.093978,0.782446,-0.688467
ColorIntensity,0.216878,-0.936107,0.719229


In [124]:
# Per-class probabilities for the first five test wines (columns follow lr.classes_).
lr.predict_proba(X_test)[:5]

array([[9.95321725e-01, 3.94845597e-03, 7.29819207e-04],
       [1.61731185e-04, 9.98297245e-01, 1.54102428e-03],
       [9.99725704e-01, 6.27412547e-05, 2.11554460e-04],
       [2.67178320e-05, 9.40417080e-06, 9.99963878e-01],
       [5.85673990e-06, 9.99193092e-01, 8.01051099e-04]])

In [125]:
# Sanity check: the class probabilities of each row add up to 1.
lr.predict_proba(X_test[:5]).sum(axis=1)

array([1., 1., 1., 1., 1.])

In [128]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [127]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, lr.predict(X_test))

array([[19,  0,  0],
       [ 1, 16,  0],
       [ 0,  1,  8]])

In [129]:
# classification_report returns a formatted string, so print() is needed to render it.
print(classification_report(y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

           1       0.95      1.00      0.97        19
           2       0.94      0.94      0.94        17
           3       1.00      0.89      0.94         9

    accuracy                           0.96        45
   macro avg       0.96      0.94      0.95        45
weighted avg       0.96      0.96      0.96        45



In [130]:
# Keep only two features so the data can be viewed in two dimensions.
two_features = ['Alcohol', 'MalicAcid']
x_train_2d = X_train.loc[:, two_features]
x_train_2d.head(3)

Unnamed: 0,Alcohol,MalicAcid
36,13.28,1.64
77,11.84,2.89
131,12.88,2.99
