In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the movie dataset from the working directory; header=0 takes the
# column names from the first row of the CSV.
df = pd.read_csv("Movie_classification.csv", header=0)
# Preview the first rows as a quick sanity check of the load.
df.head()

### Missing Value Imputation

In [None]:
df['Time_taken'].mean()

In [None]:
# Impute missing Time_taken values with the column mean.
# The result is assigned back rather than calling fillna(..., inplace=True) on
# the df['Time_taken'] selection: that form is chained in-place assignment,
# which raises a FutureWarning on pandas 2.x and leaves df unchanged once
# Copy-on-Write is active.
df['Time_taken'] = df['Time_taken'].fillna(df['Time_taken'].mean())

In [None]:
df.info()

### Dummy Variable Creation

In [None]:
df.head()

In [None]:
# Replace "3D_available" and "Genre" with indicator (dummy) columns;
# drop_first=True omits the first level of each so the remaining dummies are
# not redundant with one another.
# NOTE: this cell rebinds df. Re-running it without reloading the CSV fails,
# because the two source columns no longer exist after the first run.
df = pd.get_dummies(df,columns = ["3D_available","Genre"],drop_first = True)

In [None]:
df.head()

### X-y split

In [None]:
# Feature matrix: every column of df except the target "Start_Tech_Oscar".
is_feature = df.columns != "Start_Tech_Oscar"
X = df.loc[:, is_feature]
type(X)

In [None]:
X.head()

In [None]:
X.shape

In [None]:
y = df["Start_Tech_Oscar"]
type(y)

In [None]:
y.head()

In [None]:
y.shape

### Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
X_train.head()

In [None]:
X_train.shape

In [None]:
X_test.shape

### Training Classification Tree

In [None]:
from sklearn import tree

# Shallow classification tree (at most 3 levels of splits).
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits can otherwise be broken
# differently from run to run, changing the tree and the reported accuracy.
clftree = tree.DecisionTreeClassifier(max_depth = 3, random_state = 42)

In [None]:
clftree.fit(X_train, y_train)

### Predict values using trained model

In [None]:
y_train_pred = clftree.predict(X_train)
y_test_pred = clftree.predict(X_test)

In [None]:
y_test_pred

### Model Performance

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
confusion_matrix(y_train, y_train_pred)

In [None]:
confusion_matrix(y_test, y_test_pred)

In [None]:
accuracy_score(y_test, y_test_pred)

### Plotting decision tree

In [None]:
dot_data = tree.export_graphviz(clftree, out_file=None,feature_names= X_train.columns, filled = True)

In [None]:
from IPython.display import Image

In [None]:
import pydotplus

In [None]:
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

## Controlling Tree growth

In [None]:
# Constrain growth: at most 4 levels, and every leaf must hold >= 20 training
# samples. random_state is fixed so that ties between equally good splits are
# broken the same way on every run (reproducible tree and accuracy).
clftree2 = tree.DecisionTreeClassifier(min_samples_leaf = 20, max_depth=4, random_state = 42)
clftree2.fit(X_train, y_train)
# Export the fitted tree to Graphviz DOT text and render it inline as a PNG.
dot_data = tree.export_graphviz(clftree2, out_file=None,feature_names= X_train.columns, filled = True)
graph2 = pydotplus.graph_from_dot_data(dot_data)
Image(graph2.create_png())

In [None]:
accuracy_score(y_test, clftree2.predict(X_test))

### Bagging

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html

In [None]:
from sklearn import tree
# NOTE: this rebinds clftree to a new, unfitted tree with default settings
# (no depth limit). The depth-3 tree fitted earlier in the notebook is no
# longer reachable under this name after this cell runs.
clftree = tree.DecisionTreeClassifier()

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging ensemble: 1000 copies of clftree, each fitted on a bootstrap
# resample of the training data, trained on all available cores (n_jobs=-1).
# The base tree is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, whereas the first positional argument works on both
# old and new releases.
bag_clf = BaggingClassifier(clftree, n_estimators=1000,
                            bootstrap=True, n_jobs=-1,
                            random_state=42)

In [None]:
bag_clf.fit(X_train, y_train)

In [None]:
confusion_matrix(y_test, bag_clf.predict(X_test))

In [None]:
accuracy_score(y_test, bag_clf.predict(X_test))

### Random Forest

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1 ,random_state=42)

In [None]:
rf_clf.fit(X_train, y_train)

In [None]:
confusion_matrix(y_test, rf_clf.predict(X_test))

In [None]:
accuracy_score(y_test, rf_clf.predict(X_test))

### Grid Search

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# NOTE: rebinds rf_clf. From here on the name refers to this new, unfitted
# 250-tree forest, not the 1000-tree forest fitted in the Random Forest section.
rf_clf = RandomForestClassifier(n_estimators=250,random_state=42)

In [None]:
# Hyper-parameter grid for the random forest search:
# 7 values of max_features x 3 values of min_samples_split = 21 candidates.
params_grid = {
    "max_features": list(range(4, 11)),
    "min_samples_split": [2, 3, 10],
}

In [None]:
grid_search = GridSearchCV(rf_clf, params_grid,
                           n_jobs=-1, cv=5, scoring='accuracy')

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_ 

In [None]:
cvrf_clf = grid_search.best_estimator_

In [None]:
accuracy_score(y_test, cvrf_clf.predict(X_test))

In [None]:
confusion_matrix(y_test, cvrf_clf.predict(X_test))

### Gradient Boosting

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with default hyper-parameters as a baseline.
# random_state is fixed so the per-split feature permutation (and therefore
# tie-breaking between equally good splits) is the same on every run.
gbc_clf = GradientBoostingClassifier(random_state=42)
gbc_clf.fit(X_train, y_train)

In [None]:
accuracy_score(y_test, gbc_clf.predict(X_test))

In [None]:
# Many small steps: 1000 depth-1 trees (stumps) with a low learning rate.
# random_state is fixed so repeated runs build the same ensemble and report
# the same train/test accuracy.
gbc_clf2 = GradientBoostingClassifier(learning_rate =0.02, n_estimators =1000, max_depth = 1,
                                      random_state = 42)
gbc_clf2.fit(X_train, y_train)

In [None]:
accuracy_score(y_train, gbc_clf2.predict(X_train))

In [None]:
accuracy_score(y_test, gbc_clf2.predict(X_test))

### Ada Boost

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost with its default base estimator: up to 5000 boosting rounds at a
# small learning rate. random_state is fixed so the seeds handed to each
# round's estimator are the same on every run.
ada_clf = AdaBoostClassifier(learning_rate =0.02, n_estimators =5000, random_state =42)

In [None]:
ada_clf.fit(X_train, y_train)

In [None]:
accuracy_score(y_train, ada_clf.predict(X_train))

In [None]:
accuracy_score(y_test, ada_clf.predict(X_test))

In [None]:
# AdaBoost using a random forest as the base estimator. When the notebook is
# run top to bottom, rf_clf here is the 250-tree forest created in the Grid
# Search section, not the 1000-tree forest fitted earlier.
# NOTE(review): up to 500 boosting rounds of a 250-tree forest may be very
# slow to fit — confirm this cost is intended.
ada_clf2 = AdaBoostClassifier(rf_clf,learning_rate =0.05, n_estimators =500)

In [None]:
ada_clf2.fit(X_train, y_train)

In [None]:
accuracy_score(y_test, ada_clf2.predict(X_test))

### XG Boost

http://xgboost.readthedocs.io/en/latest/

In [None]:
import xgboost as xgb

In [None]:
#### Parameter Tuning

1. General Parameters: Guide the overall functioning
2. Booster Parameters: Guide the individual booster (tree/regression) at each step
3. Learning Task Parameters: Guide the optimization performed

In [None]:
xgb_clf = xgb.XGBClassifier(max_depth=5, n_estimators=10000, learning_rate=0.3,
                            n_jobs=-1)

In [None]:
xgb_clf.fit(X_train, y_train)

In [None]:
accuracy_score(y_test, xgb_clf.predict(X_test))

In [None]:
xgb.plot_importance(xgb_clf)

In [None]:
xgb_clf = xgb.XGBClassifier(n_estimators=250,learning_rate=0.1, random_state=42)

In [None]:
# Search space for the XGBoost grid search:
# 4 depths x 3 gammas x 2 subsample x 2 colsample_bytree x 3 reg_alpha
# = 144 candidate settings.
param_test1 = dict(
    max_depth=range(3, 10, 2),
    gamma=[0.1, 0.2, 0.3],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
    reg_alpha=[1e-2, 0.1, 1],
)

In [None]:
grid_search = GridSearchCV(xgb_clf, param_test1,
                           n_jobs=-1, cv=5, scoring='accuracy')

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
cvxg_clf = grid_search.best_estimator_

In [None]:
accuracy_score(y_test, cvxg_clf.predict(X_test))

In [None]:
grid_search.best_params_