In [1]:
import pandas as pd

from sklearn.metrics import classification_report, accuracy_score

import modeling as m

#ignore minor warnings
import warnings
warnings.filterwarnings("ignore")

# Key takeaways

- The data is vectorized with the TfidfVectorizer, keeping the top 500 features and using n-grams ranging from single words (unigrams) up to quadgrams
- A series of classification models is used, along with ensemble models: Random Forests and a BaggingClassifier (using LogisticRegression, KNNeighbors, DecisionTrees, and ExtraTreesClassifier as base estimators)
- The best model is the BaggingClassifier using the DecisionTreeClassifier, with an accuracy of 0.63 on the test set (0.89 on validate, against a baseline of 0.51)
- The models are overfitting to the train and validate data; however, the models remain more accurate than the baseline. Gathering more data might help improve the model, as might tuning more hyperparameters of the TfidfVectorizer.

# Data Eater (MMM = Mass Model Maker)

In [2]:
# Load the master list from CSV (path is relative to the notebook's working directory).
df_master = pd.read_csv('master_list.csv')
# Produce the train / validate / test frames consumed by the modeling cells below.
# NOTE(review): make_model_dfs lives in modeling.py and is not visible here; it presumably
# wraps the filter / clean / drop-nulls / split steps sketched under "Dev notes" — confirm.
train, validate, test = m.make_model_dfs(df_master)

In [3]:
# Build the baseline that every later model is compared against.
# The helper returns two things: a summary table (displayed below, with train and
# validate accuracy) and the baseline accuracy value that later cells pass to
# m.model_maker and m.test_model.
# NOTE(review): how the baseline prediction is chosen is defined in modeling.py — not shown here.
baseline_model, baseline_accuracy = m.baseline_model_maker(train, validate)
baseline_model

Unnamed: 0,model,train_accuracy,validate_accuracy
0,Baseline Model,0.507415,0.507836


In [4]:
# Fit the sweep of candidate models and collect their scores in one frame.
# Judging by the outputs below, each row carries the model name, its attributes,
# train/validate accuracy, and a better_than_baseline flag; the call also prints
# progress messages while it runs.
# NOTE(review): the set of models and hyperparameters is defined inside m.model_maker — confirm there.
models_df = m.model_maker(train, validate, baseline_accuracy)

starting rf and et
finished rf and et


In [5]:
# Rank models: those beating the baseline first, then by validate accuracy
# (both descending), and display the top 25.
models_df.sort_values(
    by=['better_than_baseline', 'validate_accuracy'],
    ascending=[False, False],
).head(n=25)

Unnamed: 0,model,attributes,train_accuracy,validate_accuracy,better_than_baseline
124,BaggingClassifier,estimator = ExtraTreesClassifier,0.976695,0.896654,True
123,BaggingClassifier,estimator = DecisionTreeClassifier,0.965042,0.891995,True
1,KNNeighbors,n_neighbors = 1,0.989407,0.890301,True
48,Decision Tree Classifier,max_depth = 24,0.92161,0.839898,True
46,Decision Tree Classifier,max_depth = 23,0.917903,0.837357,True
44,Decision Tree Classifier,max_depth = 22,0.912606,0.835663,True
42,Decision Tree Classifier,max_depth = 21,0.903072,0.82931,True
40,Decision Tree Classifier,max_depth = 20,0.89089,0.814909,True
38,Decision Tree Classifier,max_depth = 19,0.879767,0.808132,True
36,Decision Tree Classifier,max_depth = 18,0.863877,0.798814,True


In [6]:
# Train-minus-validate accuracy per model: the size of the drop from train to validate.
models_df['change'] = models_df['train_accuracy'].sub(models_df['validate_accuracy'])
# Rank by validate accuracy (the baseline flag only breaks ties), both descending,
# and display the top 50.
models_df.sort_values(
    by=['validate_accuracy', 'better_than_baseline'],
    ascending=[False, False],
).head(n=50)

Unnamed: 0,model,attributes,train_accuracy,validate_accuracy,better_than_baseline,change
124,BaggingClassifier,estimator = ExtraTreesClassifier,0.976695,0.896654,True,0.080041
123,BaggingClassifier,estimator = DecisionTreeClassifier,0.965042,0.891995,True,0.073047
1,KNNeighbors,n_neighbors = 1,0.989407,0.890301,True,0.099106
48,Decision Tree Classifier,max_depth = 24,0.92161,0.839898,True,0.081712
46,Decision Tree Classifier,max_depth = 23,0.917903,0.837357,True,0.080545
44,Decision Tree Classifier,max_depth = 22,0.912606,0.835663,True,0.076943
42,Decision Tree Classifier,max_depth = 21,0.903072,0.82931,True,0.073762
40,Decision Tree Classifier,max_depth = 20,0.89089,0.814909,True,0.075981
38,Decision Tree Classifier,max_depth = 19,0.879767,0.808132,True,0.071635
36,Decision Tree Classifier,max_depth = 18,0.863877,0.798814,True,0.065063


In [19]:
# Shape of the subset of models that beat the baseline, next to the shape of the
# full results frame (the column count includes 'change' once that column has been added).
(
    models_df.loc[models_df['better_than_baseline'].eq(True)].shape,
    models_df.shape,
)

((54, 6), (125, 6))

In [7]:
# Final evaluation on the held-out test split; the returned frame (displayed below)
# reports train, validate, and test accuracy for the model alongside the baseline flag.
# NOTE(review): which model is evaluated is decided inside m.test_model, not in this cell — confirm in modeling.py.
m.test_model(train, validate, test, baseline_accuracy)

Unnamed: 0,model,attributes,train_accuracy,validate_accuracy,test_accuracy,better_than_baseline
0,BaggingClassifier,estimator = DecisionTreeClassifier,0.965042,0.891995,0.631512,True


# Dev notes

In [8]:
# df_master = pd.read_csv('master_list.csv')
# df_master

In [9]:
# # take only the top languages
# df_master = m.filter_languages(df_master)
# df_master.language.value_counts()

In [10]:
# get a clean readme corpus to pass into the modeling pipeline
# df_master['cleaned_readme'] = df_master.readme_contents.apply(m.clean_data)
# df_master

In [11]:
# drop nulls
# df_master = m.drop_nulls(df_master)
# df_master.shape

In [12]:
# split data
# train, validate, test = m.split_data(df_master)
# train.shape, validate.shape, test.shape

In [13]:
# X_train, y_train, X_validate, y_validate = m.make_X_y_df(train, validate)

In [14]:
# pd.DataFrame(X_validate.todense())

In [15]:
# pd.DataFrame(X_train.todense())

In [16]:
# m.make_log_reg_model(train, validate, 0.52)