The following cells of code are part of a Random Forest model that is completed on edX data from Kaggle.

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error

In [3]:
# Imports train/test data. Converts grade to a numeric value.
train_data = pd.read_csv('data/edx_train.csv')
test_data = pd.read_csv('data/edx_test.csv')
train_data['grade'] = pd.to_numeric(train_data.grade, errors = 'coerce')
test_data['grade'] = pd.to_numeric(test_data.grade, errors = 'coerce')

In [4]:
# In this model, we use a greater number of features, improving the amount of data we can use, by assigning non-numeric
# colums to dummy Pandas values.
features = ['registered',
            'viewed',
            'explored',
            'final_cc_cname_DI',
            'LoE_DI',
            'YoB',
            'grade',
            'nevents',
            'ndays_act',
            'nplay_video',
            'nchapters',
            'nforum_posts'
            ]
target = 'certified'

train_dummy = pd.get_dummies(train_data[features + [target]])
test_dummy = pd.get_dummies(test_data)

# Splits data into train/val sets, at a test size of 0.1
train_rf, val_rf = train_test_split(train_dummy, test_size = 0.1)

# Updates features list with new dummy variable column names.
features = list(train_rf.columns)
features.remove(target)

#  Updates train/validation sets with updated features, accounts for NA values
train_rf[features] = train_rf[features].fillna(0)
val_rf[features] = val_rf[features].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [5]:
# Creating Random Forest model and fitting it with data
model = RandomForestClassifier(max_depth = 2)
model = model.fit(train_rf[features], train_rf[target])

# Predicts outcome values based on model and calculates an accuracy score for the model
pred_y = model.predict(val_rf[features])
acc_score = accuracy_score(val_rf[target], pred_y)
print("Accuracy Score: " + str(acc_score))

Accuracy Score: 0.9726027397260274


As demonstrated by the accuracy score of this model versus the last, we see an improvement in the prediction ability of the model. However, we may be able to improve this model!

The following is an updated version of the Random Forest Model, created with a more controlled set of features, as well as tested with different max_depth values for the model.

In [17]:
features_update = ['registered',
            'viewed',
            'explored',
            'LoE_DI',
            'YoB',
            'grade',
            'nevents',
            'ndays_act',
            'nplay_video',
            'nchapters',
            'nforum_posts'
            ]
target = 'certified'

# Create dummies for non-numeric features 
train_dummy = pd.get_dummies(train_data[features_update + [target]])
test_dummy = pd.get_dummies(test_data)

# Train/val split
train_rf, val_rf = train_test_split(train_dummy, test_size = 0.2)

# Collect dummy column names
features_update = list(train_rf.columns)
features_update.remove(target)

# Updates train/val with dummies and accounts for NA values
train_rf[features_update] = train_rf[features_update].fillna(0)
val_rf[features_update] = val_rf[features_update].fillna(0)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


The cell below iterates through a range of values for max depth in order to find the one which produces the higheest accuracy score. 

In [23]:
# Establishes variables for loop
acc = []
depth = list(range(1,6))
count = 1

# Loop iterates through depths. It creates a model, predicts values, and calculates an accuracy score for i in depths. 
for i in depth:
    model = RandomForestClassifier(max_depth = i)
    model = model.fit(train_rf[features_update], train_rf[target])
    y_pred = model.predict(val_rf[features_update])
    acc_score = accuracy_score(y_pred, val_rf[target])
    acc.append((count, acc_score))
    count = count + 1

# Converts the list of data to a Pandas dataframe and gives it column nanes to make the data more interpretable.
acc_table = pd.DataFrame(acc)
acc_table.columns = ['max_depth', 'validation accuracy score']
acc_table

Unnamed: 0,max_depth,validation accuracy score
0,1,0.97774
1,2,0.981164
2,3,0.997146
3,4,0.997146
4,5,0.997146
