# Overview
Using this notebook to build and test functions for a future one-tap model script called model.py. 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# df = pd.read_json('data.json')
# df.to_csv('microsoft_github_readmes.csv')
df = pd.read_csv('cleaned_readmes_v2.csv', index_col=0)
df

Unnamed: 0,repo,language,content
0,microsoft/react-native-windows,C++,h1 aligncenter react native window h1 p alignc...
1,microsoft/fast,TypeScript,fastbannergithub914pnghttpsstaticfastdesignass...
2,microsoft/Application-Insights-Workbooks,JSON,azure monitor workbook template statushttpsgit...
3,microsoft/gctoolkit,Java,microsoft gctoolkit gctoolkit set library anal...
4,microsoft/winget-cli-restsource,C#,welcome wingetclirestsource repository buildin...
...,...,...,...
1371,microsoft/AzureClusterlessHPC.jl,Julia,httpsimgshieldsiobadgedocsstablebluesvghttpsmi...
1372,microsoft/multicluster-gitops,Shell,multicluster multitenant environment flux v2 r...
1373,microsoft/kfp-event-handler,Go,kubeflow pipeline event handler microservice e...
1374,microsoft/Build-Docker-Provider,Shell,builddockerprovider superproject dockerprovide...


In [3]:
df.dropna(subset=['content'], inplace=True)
df['is_TypeScript'] = df.language == 'TypeScript'
df.head(3)

Unnamed: 0,repo,language,content,is_TypeScript
0,microsoft/react-native-windows,C++,h1 aligncenter react native window h1 p alignc...,False
1,microsoft/fast,TypeScript,fastbannergithub914pnghttpsstaticfastdesignass...,True
2,microsoft/Application-Insights-Workbooks,JSON,azure monitor workbook template statushttpsgit...,False


In [4]:
trainvalidate, test = train_test_split(df,stratify=df.is_TypeScript, test_size=.2, random_state=123)
train, validate = train_test_split(trainvalidate,stratify=trainvalidate.is_TypeScript, test_size=.2, random_state=123)

In [5]:
X_train, y_train = train.content, train.is_TypeScript
X_validate, y_validate = validate.content, validate.is_TypeScript
X_test, y_test = test.content, test.is_TypeScript

X_train.shape, y_train.shape

((876,), (876,))

# Curriculum
code from the lesson: imports

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

code from the lesson: count vectorization

In [7]:
cv = CountVectorizer()
bag_of_words = cv.fit_transform(train.content)
pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names())
# cv.vocabulary_

Unnamed: 0,00,000,0000,000000,00000000,00000000000000000000000000000000,0000000000000000000000000000000000000000,0000000000000000aaaaaaaaaaaaaaaa,000001,000001163jpg,...,zza,zzc1v,zzgkuu5emjwgy,zzhifxzjtfwg6,zzsj51kv8qk65,zzt6iwiwzukzl8rfk,zztazwihbirjbdgrjzg,zzvn2m,zzvr0wx4bb1qgdhdqx2m2q9jqd9,zzxzzrivxr
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
872,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
873,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
874,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


code from lesson: tf-idf

In [8]:
tfidf = TfidfVectorizer()
bag_of_words = tfidf.fit_transform(train.content)
pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names())
# pd.Series(dict(zip(tfidf.get_feature_names(), tfidf.idf_))).sort_values()

Unnamed: 0,00,000,0000,000000,00000000,00000000000000000000000000000000,0000000000000000000000000000000000000000,0000000000000000aaaaaaaaaaaaaaaa,000001,000001163jpg,...,zza,zzc1v,zzgkuu5emjwgy,zzhifxzjtfwg6,zzsj51kv8qk65,zzt6iwiwzukzl8rfk,zztazwihbirjbdgrjzg,zzvn2m,zzvr0wx4bb1qgdhdqx2m2q9jqd9,zzxzzrivxr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


code from lesson: count-vectorized modeling

In [9]:
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train)
X_tfidf_validate = cv.transform(X_validate)
X_tfidf_test = cv.transform(X_test)
tree = DecisionTreeClassifier(max_depth=3, random_state=123)
tree.fit(X_bow, y_train)
print(tree.score(X_bow, y_train))
print(tree.score(X_tfidf_validate, y_validate))

0.8481735159817352
0.8454545454545455


code from lesson: feature selection (#1)

In [10]:
pd.Series(dict(zip(cv.get_feature_names(), tree.feature_importances_)))\
.sort_values().tail(5)

quick    0.037062
bit      0.047050
chart    0.119485
yarn     0.180060
npm      0.565266
dtype: float64

code from lesson: tf-idf modeling

In [11]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train)
X_tfidf_test = tfidf.transform(X_test)
tree.fit(X_tfidf, y_train)
print(tree.score(X_tfidf, y_train))
print(tree.score(X_tfidf_validate, y_validate))

0.8618721461187214
0.8045454545454546


code from lesson: feature selection (#2)

In [12]:
pd.Series(dict(zip(cv.get_feature_names(), tree.feature_importances_)))\
.sort_values().tail(5)

wa           0.051946
end          0.060698
extension    0.171375
chart        0.172654
npm          0.501589
dtype: float64

# Microsoft READMEs
- best model so far: CV tree

models to use:
- randomforest
- logisticregression
- naive bayes
- knn

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [14]:
cv = CountVectorizer(ngram_range=(2,2))
X_bow = cv.fit_transform(X_train)
X_tfidf_validate = cv.transform(X_validate)
X_tfidf_test = cv.transform(X_test)
rf = RandomForestClassifier(n_estimators=1000, 
                              max_depth=10, 
                              min_samples_leaf=1,
                              random_state=123)
rf.fit(X_bow, y_train)
print(rf.score(X_bow, y_train))
print(rf.score(X_tfidf_validate, y_validate))

tfidf = TfidfVectorizer(ngram_range=(2,2))
X_tfidf = tfidf.fit_transform(X_train)
X_tfidf_test = tfidf.transform(X_test)
rf.fit(X_tfidf, y_train)
print(rf.score(X_tfidf, y_train))
print(rf.score(X_tfidf_validate, y_validate))

0.7488584474885844
0.740909090909091
0.7454337899543378
0.740909090909091


In [15]:
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train)
X_tfidf_validate = cv.transform(X_validate)
X_tfidf_test = cv.transform(X_test)
logit = LogisticRegression(random_state=123)
logit.fit(X_bow, y_train)
print(logit.score(X_bow, y_train))
print(logit.score(X_tfidf_validate, y_validate))

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train)
X_tfidf_test = tfidf.transform(X_test)
logit.fit(X_tfidf, y_train)
print(logit.score(X_tfidf, y_train))
print(logit.score(X_tfidf_validate, y_validate))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.997716894977169
0.8545454545454545
0.8458904109589042
0.8136363636363636


In [16]:
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train).todense()
X_tfidf_validate = cv.transform(X_validate).todense()
X_tfidf_test = cv.transform(X_test).todense()
nb = GaussianNB(var_smoothing=100)
nb.fit(X_bow, y_train)
print(nb.score(X_bow, y_train))
print(nb.score(X_tfidf_validate, y_validate))

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train).todense()
X_tfidf_test = tfidf.transform(X_test).todense()
nb.fit(X_tfidf, y_train)
print(nb.score(X_tfidf, y_train))
print(nb.score(X_tfidf_validate, y_validate))

0.7431506849315068
0.740909090909091
0.7431506849315068
0.7818181818181819


In [17]:
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train)
X_tfidf_validate = cv.transform(X_validate)
X_tfidf_test = cv.transform(X_test)
knn = KNeighborsClassifier(n_neighbors=25)
knn.fit(X_bow, y_train)
print(knn.score(X_bow, y_train))
print(knn.score(X_tfidf_validate, y_validate))

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_train)
X_tfidf_test = tfidf.transform(X_test)
knn.fit(X_tfidf, y_train)
print(knn.score(X_tfidf, y_train))
print(knn.score(X_tfidf_validate, y_validate))

0.769406392694064
0.7636363636363637
0.8253424657534246
0.8227272727272728


In [18]:
cv_df = pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names())