# Imports

In [None]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
from patsy import dmatrices
from sklearn import cross_validation as cv
from sklearn import linear_model, datasets, metrics
import statsmodels.api as smf
from sklearn.cross_validation import KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import warnings
# Notebook-wide settings: ignore all warnings and set plotting defaults.
warnings.filterwarnings(action='ignore')

# Produces pretty plots!
pd.set_option('display.mpl_style', 'default')
# Default canvas size for every figure that does not override it.
plt.rcParams.update({'figure.figsize': (15, 15)})

# Read data

In [None]:
# Load the movie table and turn the two money columns into floats.
data = pd.read_csv('result.csv')

# Both columns are read as text such as "$1,234,567": strip the currency sign
# and the thousands separators before converting.
for money_col in ('domestic gross', 'production budget'):
    data[money_col] = data[money_col].str.replace('$', '').str.replace(',', '').astype(float)

# Keep only rows with a positive domestic gross (log_gross is computed from it
# later). The explicit copy makes `data` an independent frame, so the column
# assignments below do not write into a slice of the frame that was read.
data = data[data['domestic gross'] > 0].copy()

# Short aliases without spaces, usable inside patsy formulas.
data['dgross'] = data['domestic gross']
data['pbudget'] = data['production budget']

In [None]:
# Summary statistics of `data`, shown as the cell output.
data.describe()

In [None]:
# Add natural-log versions of budget and gross as new columns.
# NOTE(review): only dgross was filtered to be > 0; a zero pbudget would give
# -inf here -- confirm the data has none.
for raw_col, log_col in (('pbudget', 'log_budget'), ('dgross', 'log_gross')):
    data[log_col] = np.log(data[raw_col])

In [None]:
#data.replace([np.inf, -np.inf], np.nan)
#data.dropna(inplace=True)

In [None]:
# Let's take a look at the raw budget/gross relationship
raw_cols = ['pbudget', 'dgross']
pd.tools.plotting.scatter_matrix(data[raw_cols])
# Printing an empty string keeps the axes array from being echoed as cell output.
print('')

In [None]:
# Same scatter matrix, this time on the log-transformed columns.
log_cols = ['log_gross', 'log_budget']
pd.tools.plotting.scatter_matrix(data[log_cols])
# Printing an empty string keeps the axes array from being echoed as cell output.
print('')

In [None]:
# Design matrices for the full model: log gross on log budget plus the listed
# genre columns.
genre_terms = ['thriller', 'comedy', 'drama', 'documentary', 'action',
               'animation', 'horror', 'fantasy', 'romance']
full_formula = 'log_gross ~ log_budget + ' + ' + '.join(genre_terms)
y, X = dmatrices(full_formula, data=data, return_type='dataframe')

# Regular regression (statsmodels)

In [None]:
# Ordinary least squares with statsmodels; `results` holds the fit.
model = smf.OLS(endog=y, exog=X)
results = model.fit()

In [None]:
# Display the statsmodels summary of the OLS fit stored in `results`.
results.summary()

# sklearn

In [None]:
# One-feature design on the raw scale: gross explained by budget.
raw_formula = 'dgross ~ pbudget'
y, X = dmatrices(raw_formula, data=data, return_type='dataframe')

In [None]:
# Fit on every row, then show the score on those same rows (in-sample fit).
model = linear_model.LinearRegression().fit(X, y)
model.score(X, y)

In [None]:
# Fitted coefficients first, then the intercept.
print(model.coef_)
print(model.intercept_)

# log transformation

In [None]:
# One-feature design on the log scale: log gross explained by log budget.
log_formula = 'log_gross ~ log_budget'
y, X = dmatrices(log_formula, data=data, return_type='dataframe')

In [None]:
# Refit the linear model on the current (log-scale) design and show its
# in-sample score.
model = linear_model.LinearRegression().fit(X, y)
model.score(X, y)

# Lasso/L1 regularization
### Useful for large datasets with many features: the L1 penalty drives some coefficients exactly to 0

In [None]:
# Full log-scale design again: log budget plus the listed genre columns.
lasso_terms = ['log_budget', 'thriller', 'comedy', 'drama', 'documentary',
               'action', 'animation', 'horror', 'fantasy', 'romance']
lasso_formula = 'log_gross ~ ' + ' + '.join(lasso_terms)
y, X = dmatrices(lasso_formula, data=data, return_type='dataframe')

In [None]:
import sklearn

In [None]:
# Hold out 20% of the rows (fixed seed), then fit LassoCV with 10-fold
# cross-validation on the training part only.
x_train, x_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.20, random_state=1234)
model_lasso1 = linear_model.LassoCV(eps=0.001, n_alphas=100, cv=10, normalize=True).fit(
    x_train, sklearn.utils.column_or_1d(y_train))

# Mean squared error on the training rows, then on the held-out rows.
print(metrics.mean_squared_error(y_train, model_lasso1.predict(x_train)))
print(metrics.mean_squared_error(y_test, model_lasso1.predict(x_test)))

# Build one string: print('alpha=', value) would print a tuple under the
# Python 2 print statement this notebook uses.
print('alpha= %s' % model_lasso1.alpha_)
m_alphas = model_lasso1.alphas_
print(model_lasso1.coef_)

In [None]:
# Lasso with its default settings on the full design; show the coefficients.
model_lasso = linear_model.Lasso().fit(X, y)
model_lasso.coef_

In [None]:
# Score of the default Lasso on the same rows it was fitted on.
model_lasso.score(X, y)

# Ridge/L2 regularization
### penalises large coefficients, so extreme coefficient values become less likely

In [None]:
# Ridge with its default settings on the full design; show the coefficients.
model_ridge = linear_model.Ridge().fit(X, y)
model_ridge.coef_

In [None]:
# Score of the default Ridge on the same rows it was fitted on.
model_ridge.score(X, y)

# cross-validation

In [None]:
# Back to the raw-scale, one-feature design for the cross-validation section.
cv_formula = 'dgross ~ pbudget'
y, X = dmatrices(cv_formula, data=data, return_type='dataframe')

In [None]:
# Peek at the first rows of the design matrix.
X.head()

In [None]:
from sklearn.cross_validation import train_test_split

# Baseline: score on the same rows the model was fitted on.
model = linear_model.LinearRegression()
print(model.fit(X, y).score(X, y))

# Held-out score over ten 80/20 splits. Each split is seeded with its loop
# index so that re-running the cell reproduces the same numbers.
# `model` is left fitted on the training part of the last split.
split_scores = []
for k in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=k)
    split_scores.append(model.fit(X_train, y_train).score(X_test, y_test).round(3))
print(' '.join(str(score) for score in split_scores))

In [None]:
y_pred = model.predict(X)
MAE = np.abs(y - y_pred).mean()
MSE = np.square(y_pred - y).mean()
print MAE, MSE

In [None]:
# Single reproducible split: roughly one third of the rows are held out.
holdout_parts = cv.train_test_split(X, y, test_size=0.333, random_state=1234)
x_train, x_test, y_train, y_test = holdout_parts

In [None]:
# Fit a fresh linear model on the training part of the split only.
model = linear_model.LinearRegression().fit(X=x_train, y=y_train)

In [None]:
# Show (train score, test score) together. A notebook cell only displays its
# last expression, so two separate lines would silently drop the train score.
model.score(x_train, y_train), model.score(x_test, y_test)

In [None]:
# Number of rows in the held-out target.
len(y_test)

In [None]:
# Shapes of the held-out features and target from the seeded split.
# x_test (lower case) is the partner of y_test; X_test is a leftover from the
# earlier 80/20 loop and has a different number of rows.
print(x_test.shape)
print(y_test.shape)

In [None]:
# Held-out features, transposed for display. x_test (not the stale X_test from
# the earlier 80/20 loop) is the split that belongs to the current model.
x_test.T

In [None]:
# Predictions for the held-out rows of the split the model was trained on
# (x_test, not the stale X_test left over from the earlier 80/20 loop).
model.predict(x_test)

In [None]:
plt.rcParams['figure.figsize'] = (5, 5)
# Actual vs. predicted gross on the held-out rows. The predictions must come
# from x_test, the partner of y_test; X_test is left over from the earlier
# 80/20 loop and has a different number of rows, which makes scatter() fail.
plt.scatter(y_test, model.predict(x_test))
plt.xlabel('actual dgross')
plt.ylabel('predicted dgross')
plt.show()

# Regression line

In [None]:
# Raw-scale design: gross on budget plus the listed genre columns.
raw_terms = ['pbudget', 'documentary', 'drama', 'comedy', 'thriller',
             'fantasy', 'animation']
raw_genre_formula = 'dgross ~ ' + ' + '.join(raw_terms)
y, X = dmatrices(raw_genre_formula, data=data, return_type='dataframe')

In [None]:
# Log-scale feature and target series used by the polynomial-degree study.
x_sample = data['log_budget']
y_sample = data['log_gross']

In [None]:
def analyze_performance(test_model, features=None, target=None, n_degrees=30, n_splits=15):
    """Compare in-sample and held-out scores across polynomial degrees.

    For each degree in 0 .. n_degrees - 1 a pipeline
    StandardScaler -> PolynomialFeatures(degree) -> test_model is fitted.

    Parameters
    ----------
    test_model : estimator
        Final step of the pipeline.
    features : array-like of shape (n_rows, n_columns), optional
        Predictors. Defaults to the notebook-level ``x_sample`` reshaped into a
        single column. The previous version read the global ``X``, which at
        this point is a multi-column raw-dollar design paired with a log-scale
        target, and whose polynomial expansion explodes at high degree.
    target : array-like, optional
        Response. Defaults to the notebook-level ``y_sample``.
    n_degrees : int, optional
        Number of polynomial degrees to try, starting at 0.
    n_splits : int, optional
        Number of random 70/30 splits averaged for the held-out score.

    Returns
    -------
    pandas.DataFrame
        Indexed by degree, with columns ``'overfit'`` (score on the rows used
        for fitting) and ``'cv'`` (mean held-out score over the splits).
    """
    if features is None:
        features = np.asarray(x_sample).reshape(-1, 1)
    if target is None:
        target = y_sample

    scores = {'overfit': {}, 'cv': {}}
    for degree in range(n_degrees):
        model = make_pipeline(StandardScaler(), PolynomialFeatures(degree), test_model)
        scores['overfit'][degree] = model.fit(features, target).score(features, target)
        cv_scores = []
        # The same seeds are reused for every degree, so all degrees are scored
        # on identical splits and the result is reproducible.
        for k in range(n_splits):
            X_train, X_test, y_train, y_test = train_test_split(
                features, target, train_size=.7, random_state=k)
            cv_scores.append(model.fit(X_train, y_train).score(X_test, y_test))
        scores['cv'][degree] = np.mean(cv_scores)
    return pd.DataFrame(scores)

In [None]:
# Run the degree study for plain linear regression and plot both score curves.
scores = analyze_performance(linear_model.LinearRegression())
f = scores.plot(ylim=(-.05, 1.05))
best_degree = scores.cv.argmax()
f = (plt.title("Best cv performance at degree %d" % best_degree),
     plt.xlabel('degree'),
     plt.ylabel('$R^2$'))

In [None]:
# Evenly spaced grid spanning the observed raw budgets.
budget_lo, budget_hi = data.pbudget.min(), data.pbudget.max()
domain = np.linspace(budget_lo, budget_hi)

In [None]:
# Display the per-degree score table returned by analyze_performance.
scores

In [None]:
x_small_sample = data.log_budget
y_small_sample = data.log_gross
# Evaluate the fitted lines over the observed range of the feature. The models
# are trained on log budgets, so a grid in raw dollars (4e7 .. 1.5e8) would lie
# far outside the data and push the scatter off the axes.
domain = np.linspace(x_small_sample.min(), x_small_sample.max())

degree, alpha = 1, 10

X = np.array([x_small_sample]).T
fig, axes = plt.subplots(1, 3, figsize=(30, 10))
candidates = [linear_model.LinearRegression(), linear_model.Ridge(alpha=alpha), linear_model.Lasso(alpha=alpha)]
for no, my_model in enumerate(candidates):
    model = make_pipeline(PolynomialFeatures(degree), my_model)
    r2, MSE = [], []
    # Refit on 60 different 70/30 splits; seeding with k gives every estimator
    # the same splits and makes the figure reproducible.
    for k in range(60):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_small_sample, train_size=.7, random_state=k)
        r2.append(model.fit(X_train, y_train).score(X_test, y_test))
        y_pred = model.predict(np.array([domain]).T)
        axes[no].plot(domain, y_pred, alpha=.3)
        y_pred_sample = model.predict(X)
        # Mean (not sum) of squared errors, so the value matches its "MSE" label.
        MSE.append(np.square(y_pred_sample - y_small_sample).mean())
    axes[no].scatter(x_small_sample, y_small_sample, s=70)
    # Report the held-out R2 collected above for this estimator; the old title
    # showed np.mean(scores.cv), a number from the unrelated degree study.
    axes[no].set_title("%s (R2 %.2f, MSE %.2f)" % (my_model.__class__.__name__, np.mean(r2), np.mean(MSE)))
    axes[no].set_xlabel('log_budget')
    axes[no].set_ylabel('log_gross')
    # The y-limits are left to autoscale: the old (-200, 200000000) range was
    # on the raw-dollar scale, not the log scale plotted here.
    axes[no].set_xlim(min(domain), max(domain))

In [None]:
from sklearn.cross_validation import cross_val_score, train_test_split

In [None]:
# Candidate predictors; only the first entry is used in this run.
features = [
    'pbudget', 'comedy', 'drama', 'scifi', 'animation',
    'documentary', 'thriller', 'mystery', 'musical', 'children',
]
X = data[features[:1]]
y = data['dgross']
model = linear_model.LinearRegression()
# Mean score over 10 cross-validation folds.
# (Author's note: more features = better score.)
fold_scores = cross_val_score(model, X, y, cv=10)
fold_scores.mean()

In [None]:
# Same 10-fold evaluation, now using all five listed columns as predictors.
features = ['horror', 'pbudget', 'comedy', 'drama', 'scifi']
X = data[features]
y = data['dgross']
model = linear_model.LinearRegression()
fold_scores = cross_val_score(model, X, y, cv=10)
fold_scores.mean()