# Week 5

<img src="roc.png" width="600">

<img src="pdp.png" width="600">

This week we dove into tree-based methods, started learning Natural Language Processing, and learned some clustering techniques as well. We had our second case study on Friday, and it went a lot smoother than the previous one last week. Not only did we have more knowledge, but we also had a better understanding of how to work in a one-day time frame, collaborative coding with Git and GitHub, and appropriate file architecture for this type of project. 

Here's a rundown of what we covered:

* Decision trees
    * Bagged trees
* Random forests
    * Bagging plus random feature sub-setting
* Boosting
    * Adaboost
    * Gradient boosting
    * XGBoost
    * Catboost
* Natural Language Processing
    * Word to vector
    * Bag of words
* Naive Bayes classification
* KMeans clustering
* Hierarchical clustering

Tree method breakdown:

Algorithm | Learner Type | Hyperparameters | Pros | Cons
--- | --- | --- | --- | ---
Decision Trees | Strong | Impurity function | No scaling, model non-linear relationships, classification and regression, predictions cheap, highly interpretable | expensive to train, needs pruning to avoid overfitting
Random Forests | Strong | Number of trees, number of features at each split, individual tree parameters, tree depth, pruning, split criterion | great performance, often little tuning needed, no feature scaling, model non-linear relationships | expensive to train, hard to interpret effect of each feature
Boosted Trees | Weak | number of estimators, learning rate alpha, max depth, max features, subsample amount | squeezes out predictive power, resistant to overfitting, smoother than RF | can't be trained in parallel like RF, could cause overfitting with too many trees

Here is some code we wrote:

```python
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


def accuracy(y_hat, y):
    """Return the fraction of predictions that equal the true labels.

    y_hat holds the predicted labels and y the true labels; the two are
    compared element-wise and the mean of the matches is returned.
    """
    is_correct = (y_hat == y)
    return np.mean(is_correct)

def recall(y_hat, y):
    """Calculate the recall score.

    Recall is the fraction of truly positive samples that were also
    predicted positive: TP / (TP + FN). The positive class is the label 1.

    Parameters
    ----------
    y_hat : array-like
        Predicted labels.
    y : array-like
        True labels, same length as ``y_hat``.

    Returns
    -------
    float
        Recall in [0, 1]. Returns 0.0 when ``y`` contains no positive
        samples, since the ratio is undefined in that case.
    """
    y_hat = np.asarray(y_hat)
    y = np.asarray(y)

    is_positive = y == 1
    if not is_positive.any():
        # No actual positives: avoid taking the mean of an empty selection.
        return 0.0
    # Among the actual positives, how many did we predict as positive?
    return float(np.mean(y_hat[is_positive] == 1))

'''
This script has functions to clean the test and train data
as well as engineer new features, for use in the plots and churn_model
scripts
'''
class DataCleaning(BaseEstimator, TransformerMixin):
    """Clean the raw rider data and build the model feature matrix.

    The transformer fills missing values, adds 0/1 "has a rating" indicator
    columns, keeps only the modelling columns and one-hot encodes the
    categorical ones.

    The input frame is never modified; all work happens on a copy. Only the
    columns that end up in the feature matrix are read, so the date columns
    of the raw data are not required and are left untouched.

    Calling ``fit`` is optional. When it has been called, the category levels
    seen during fitting are reused in ``transform`` so every transformed frame
    gets the same dummy columns; values not seen during fitting are encoded
    as all zeros. Without ``fit`` the dummy columns are derived from the
    frame being transformed.
    """

    # Average-rating column -> name of its 0/1 "rating present" indicator.
    _RATING_FLAGS = {
        'avg_rating_by_driver': 'rating_by_driver',
        'avg_rating_of_driver': 'rating_of_driver',
    }

    # Columns that get one-hot encoded.
    _CAT_COLS = ['phone', 'city']

    # Columns of the cleaned frame, in output order (before encoding).
    _COLS_TO_BE_KEPT = ['avg_dist', 'avg_rating_by_driver', 'rating_by_driver',
                        'avg_rating_of_driver', 'rating_of_driver',
                        'avg_surge', 'city', 'phone', 'surge_pct',
                        'trips_in_first_30_days',
                        'luxury_car_user', 'weekday_pct']

    def fit(self, df, y=None):
        '''
        Record the category levels of the categorical columns.

        INPUT: UNCLEANED PANDAS DF; y is accepted for pipeline
               compatibility and ignored
        OUTPUT: self
        '''
        cleaned = self._clean(df)
        # Sorted levels match the column order get_dummies produces for
        # plain string columns.
        self.categories_ = {
            col: sorted(cleaned[col].dropna().unique())
            for col in self._CAT_COLS
        }
        return self

    def transform(self, df):
        '''
        INPUT: UNCLEANED PANDAS DF (extra columns are ignored)
        OUTPUT: NEW CLEANED PANDAS DF with missing values filled, rating
                indicator columns added and categorical columns one-hot
                encoded
        '''
        X = self._clean(df)

        categories = getattr(self, 'categories_', None)
        if categories is not None:
            # Pin the levels learned in fit so the dummy columns do not
            # depend on which values happen to appear in this frame.
            for col, levels in categories.items():
                X[col] = pd.Categorical(X[col], categories=levels)

        return pd.get_dummies(X, columns=self._CAT_COLS)

    def _clean(self, df):
        '''Return a cleaned copy of the modelling columns, not yet encoded.'''
        indicator_cols = set(self._RATING_FLAGS.values())
        raw_cols = [c for c in self._COLS_TO_BE_KEPT
                    if c not in indicator_cols]
        # Copy so the caller's frame is left exactly as it was passed in.
        X = df[raw_cols].copy()

        # convert to 1/0
        X['luxury_car_user'] = X['luxury_car_user'].astype(int)

        # add missing value to phone device
        X['phone'] = X['phone'].fillna('missing')

        for avg_col, flag_col in self._RATING_FLAGS.items():
            # -1 marks "no rating"; the indicator is 1 when a rating exists.
            X[avg_col] = X[avg_col].fillna(-1)
            X[flag_col] = (X[avg_col] != -1).astype(int)

        return X.reindex(columns=self._COLS_TO_BE_KEPT)

```

```python
import pandas as pd
import numpy as np


def feature_engineer(data):
    """Split a labelled frame into a feature matrix and a target.

    The input frame is left unmodified, so the function can be called
    repeatedly on the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the 'churn' column and every column listed in
        ``cols_to_be_kept`` below; any other columns are ignored.

    Returns
    -------
    X : pandas.DataFrame
        The kept columns with 'phone' and 'city' one-hot encoded.
    y : pandas.Series
        A copy of the 'churn' column.
    """
    cols_to_be_kept = ['avg_dist', 'avg_rating_by_driver', 'rating_by_driver',
                       'avg_rating_of_driver', 'rating_of_driver',
                       'avg_surge', 'city', 'phone', 'surge_pct',
                       'trips_in_first_30_days',
                       'luxury_car_user', 'weekday_pct']
    cat_cols = ['phone', 'city']

    # Copy the target instead of dropping it from the caller's frame.
    y = data['churn'].copy()

    # Selecting with a list of columns yields a new frame; get_dummies
    # encodes the string columns directly, no category cast is needed.
    X = pd.get_dummies(data[cols_to_be_kept], columns=cat_cols)

    return X, y
```

```python
""" This solution makes heavy use of sklearn's Pipeline class.
    You can find documentation on using this class here:
    http://scikit-learn.org/stable/modules/pipeline.html
"""
from datetime import timedelta
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, confusion_matrix, classification_report
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import numpy as np
import pandas as pd
from data_cleaning import DataCleaning
from sklearn.model_selection import train_test_split


def accuracy(y_hat, y):
    """Compute the accuracy score.

    Compares y_hat with y element-wise and returns the share of
    positions where the two agree.
    """
    hits = y_hat == y
    return np.mean(hits)

def recall(y_hat, y):
    """Calculate the recall score.

    Recall is TP / (TP + FN): of the samples whose true label is 1, the
    share that was predicted as 1.

    Parameters
    ----------
    y_hat : array-like
        Predicted labels.
    y : array-like
        True labels, same length as ``y_hat``.

    Returns
    -------
    float
        Recall in [0, 1]; 0.0 when ``y`` has no positive samples.

    Notes
    -----
    Unlike accuracy, recall is not symmetric in its arguments. sklearn's
    ``make_scorer`` calls the score function as ``(y_true, y_pred)``, so
    wrap this function to swap the arguments before using it as a scorer.
    """
    y_hat = np.asarray(y_hat)
    y = np.asarray(y)

    true_pos = np.sum((y == 1) & (y_hat == 1))
    false_neg = np.sum((y == 1) & (y_hat != 1))
    n_pos = true_pos + false_neg
    if n_pos == 0:
        # Ratio is undefined without any actual positives.
        return 0.0
    return float(true_pos / n_pos)

if __name__ == '__main__':
    # Script flow: load the training data, label churn, clean the features,
    # hold out a test set, tune one model with 10-fold CV on the training
    # part only, then report its performance on the held-out part.
    df = pd.read_csv('data/churn_train.csv')

    # creating dependent churn variables
    # labelled customers churned if they hadn't used the service in the last
    # month
    # Parse the dates so the cutoff comparison is a real date comparison
    # rather than a string comparison.
    last_trip = pd.to_datetime(df['last_trip_date'])
    y = (last_trip < pd.Timestamp('2014-06-01')).astype(int).rename('churn')

    X = DataCleaning().transform(df)

    # Split BEFORE the grid search so the held-out rows never influence
    # hyperparameter selection.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=66)

    # Parameter grids. 'auto' is not used for max_features: it meant 'sqrt'
    # for these classifiers and was removed in scikit-learn 1.3.
    rf_params = {'n_estimators': [100, 200, 500],
                 'max_depth': [3, 5, 7],
                 'max_features': ['sqrt', 'log2']}

    gb_params = {'learning_rate': [1.0],
                 'n_estimators': [100],
                 'subsample': [1.0],
                 'max_depth': [3],
                 'max_features': ['sqrt']}

    lr_params = {'C': [1, 2, 3, 4, 5]}

    # Candidate models; change the key below to tune a different one.
    candidates = {
        'rf': (RandomForestClassifier(), rf_params),
        'gb': (GradientBoostingClassifier(), gb_params),
        'lr': (LogisticRegression(), lr_params),
    }
    estimator, param_grid = candidates['gb']

    acc_scorer = make_scorer(accuracy)

    gscv = GridSearchCV(estimator=estimator,
                        param_grid=param_grid,
                        n_jobs=-1,
                        scoring=acc_scorer,
                        cv=10)
    gscv.fit(X_train, y_train)

    print('Best parameters: {}'.format(gscv.best_params_))
    print('Best CV accuracy: {}'.format(gscv.best_score_))

    # With the default refit=True the best estimator has already been
    # refit on all of X_train, so it can be used for prediction directly.
    model = gscv.best_estimator_
    predictions = model.predict(X_test)

    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
```