In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# follow the usual sklearn pattern: import, instantiate, fit
from sklearn.linear_model import LinearRegression
# this allows plots to appear directly in the notebook
%matplotlib inline

In [18]:
# load the Advertising dataset from its URL; the first CSV column becomes the row index
ADVERTISING_URL = 'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv'
data = pd.read_csv(ADVERTISING_URL, index_col=0)
# preview the first rows
data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [19]:
# create X (the three advertising features) and y (the Sales response)
feature_cols = ['TV', 'Radio', 'Newspaper']
X = data[feature_cols]
y = data.Sales

# instantiate the model and fit it on X and y
lm = LinearRegression()
lm.fit(X, y)

# print intercept and coefficients; the call form of print with a single
# argument behaves the same on Python 2 and Python 3, whereas the statement
# form is a SyntaxError on Python 3
print(lm.intercept_)
print(lm.coef_)

2.93888936946
[ 0.04576465  0.18853002 -0.00103749]


In [20]:
# pair the feature names with the coefficients; list() materializes the pairs
# so they are displayed on Python 3, where zip returns a lazy iterator
list(zip(feature_cols, lm.coef_))

[('TV', 0.045764645455397601),
 ('Radio', 0.18853001691820448),
 ('Newspaper', -0.001037493042476266)]

In [21]:
# predict for a new observation (TV=100, Radio=25, Newspaper=25); predict
# expects 2D input with one row per observation, so the single observation is
# wrapped in a one-row DataFrame whose columns match the fitted features
new_obs = pd.DataFrame([[100, 25, 25]], columns=['TV', 'Radio', 'Newspaper'])
lm.predict(new_obs)

array([ 12.20266701])

In [22]:
# R-squared of the fitted model, scored on the same X and y it was fitted on
r_squared = lm.score(X, y)
r_squared

0.89721063817895219

In [23]:
# Attach two synthetic categorical columns, Size and Area, to the data.
# (np is imported in the imports cell at the top of the notebook.)
# Fixed seed so the random assignments are the same on every run.
np.random.seed(123456)

# Size: start with every row 'small', then flip the rows whose draw exceeds
# 0.5 (roughly half) to 'large'
nums = np.random.rand(len(data))
mask_large = nums > 0.5
data['Size'] = 'small'
data.loc[mask_large, 'Size'] = 'large'

# Area: a second draw assigns roughly one third of the rows to each group.
# The urban mask includes the 0.66 boundary and the suburban mask is defined
# as its complement above 0.33, so no draw can fall between the two groups.
nums = np.random.rand(len(data))
mask_urban = nums >= 0.66
mask_suburban = (nums > 0.33) & ~mask_urban
data['Area'] = 'rural'
data.loc[mask_suburban, 'Area'] = 'suburban'
data.loc[mask_urban, 'Area'] = 'urban'
data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales,Size,Area
1,230.1,37.8,69.2,22.1,small,suburban
2,44.5,39.3,45.1,10.4,large,suburban
3,17.2,45.9,69.3,9.3,small,suburban
4,151.5,41.3,58.5,18.5,large,suburban
5,180.8,10.8,58.4,12.9,small,urban


In [24]:
# encode Size as a 0/1 indicator column: 'small' -> 0, 'large' -> 1
size_codes = {'small': 0, 'large': 1}
data['IsLarge'] = data['Size'].map(size_codes)
data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales,Size,Area,IsLarge
1,230.1,37.8,69.2,22.1,small,suburban,0
2,44.5,39.3,45.1,10.4,large,suburban,1
3,17.2,45.9,69.3,9.3,small,suburban,0
4,151.5,41.3,58.5,18.5,large,suburban,1
5,180.8,10.8,58.4,12.9,small,urban,0


In [26]:
# build one dummy column per Area value, each prefixed with 'Area'
area_dummies_all = pd.get_dummies(data['Area'], prefix='Area')
# keep every dummy column except the first one
area_dummies = area_dummies_all.iloc[:, 1:]
# append the dummy columns to the original DataFrame side by side
# (axis=0 would stack rows; axis=1 adds columns)
data2 = pd.concat([data, area_dummies], axis=1)
data2.head()

Unnamed: 0,TV,Radio,Newspaper,Sales,Size,Area,IsLarge,Area_suburban,Area_urban
1,230.1,37.8,69.2,22.1,small,suburban,0,1,0
2,44.5,39.3,45.1,10.4,large,suburban,1,1,0
3,17.2,45.9,69.3,9.3,small,suburban,0,1,0
4,151.5,41.3,58.5,18.5,large,suburban,1,1,0
5,180.8,10.8,58.4,12.9,small,urban,0,0,1


In [28]:
# features for the second model: the three spend columns plus the
# IsLarge indicator and the two Area dummy columns
feature_cols = [
    'TV',
    'Radio',
    'Newspaper',
    'IsLarge',
    'Area_suburban',
    'Area_urban',
]
X = data2.loc[:, feature_cols]
y = data2['Sales']

In [29]:
# instantiate and fit in one step (fit returns the estimator itself),
# then display the fitted estimator
lm = LinearRegression().fit(X, y)
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [30]:
# show each feature name next to its coefficient; list() materializes the
# pairs so they are displayed on Python 3, where zip returns a lazy iterator
list(zip(feature_cols, lm.coef_))

[('TV', 0.045840781190506066),
 ('Radio', 0.18797890142601903),
 ('Newspaper', -0.00095894063364709153),
 ('IsLarge', 0.31584208429078753),
 ('Area_suburban', -0.035573252696973034),
 ('Area_urban', 0.13853432364986429)]