- environment setup

In [1]:
from pprint import pprint

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

import nltk
import unicodedata
import re

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [4]:
import prepare

import acquire

In [5]:
import os


In [6]:
import torch

# Wrangle Data

- Acquire

In [7]:
from acquire import acquire_microsoft

In [8]:
df = acquire_microsoft()

In [9]:
df.head()

Unnamed: 0,repo,language,readme_contents,is_TypeScript
0,microsoft/roosterjs-react,TypeScript,\n# Contributing\n\nThis project welcomes cont...,True
1,microsoft/vscode-azure-iot-toolkit,HTML,# Azure IoT Hub\n\n[![Join the chat at https:/...,False
2,microsoft/vscode-azuretools,TypeScript,# VSCode Azure SDK for Node.js\n\n[![Build Sta...,True
3,microsoft/knack,Python,Knack\n=====\n\n.. image:: https://img.shields...,False
4,microsoft/browsecloud,TypeScript,**BrowseCloud - Public Demo**\n\n[Try out Brow...,True


In [10]:
df.shape

(261, 4)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 261 entries, 0 to 269
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             261 non-null    object
 1   language         261 non-null    object
 2   readme_contents  261 non-null    object
 3   is_TypeScript    261 non-null    bool  
dtypes: bool(1), object(3)
memory usage: 8.4+ KB


In [12]:
df.isnull().sum()

repo               0
language           0
readme_contents    0
is_TypeScript      0
dtype: int64

- Prepare

In [13]:
from prepare import prepare_microsoft

In [14]:
df.readme_contents = df.readme_contents.apply(prepare_microsoft)

In [15]:
df.head()

Unnamed: 0,repo,language,readme_contents,is_TypeScript
0,microsoft/roosterjs-react,TypeScript,contributing project welcome contribution sugg...,True
1,microsoft/vscode-azure-iot-toolkit,HTML,azure iot hub join chat httpsgitterimmicrosoft...,False
2,microsoft/vscode-azuretools,TypeScript,vscode azure sdk nodejs build statushttpsdevaz...,True
3,microsoft/knack,Python,knack image httpsimgshieldsiopypivknacksvg tar...,False
4,microsoft/browsecloud,TypeScript,browsecloud public demo try browsecloud demons...,True


In [16]:
df.readme_contents[0]

'contributing project welcome contribution suggestion contribution require agree contributor license agreement cla declaring right actually grant u right use contribution detail visit httpsclamicrosoftcom submit pull request clabot automatically determine whether need provide cla decorate pr appropriately eg label comment simply follow instruction provided bot need across repos using cla project ha adopted microsoft open source code conducthttpsopensourcemicrosoftcomcodeofconduct information see code conduct faqhttpsopensourcemicrosoftcomcodeofconductfaq contact opencodemicrosoftcommailtoopencodemicrosoftcom additional question comment'

# Modeling

### TF-IDF 

In [17]:
# make TF-IDF object
tfidf = TfidfVectorizer()

In [18]:
# fit object to list of words
X = tfidf.fit_transform(df.readme_contents)

### Setup

In [19]:
def seed_everything(seed=319):
    """"
    Seed everything.
    """   
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

In [20]:
seed_everything()

In [21]:
np.random.seed(319)

In [22]:
import numpy as np
from scipy.stats import norm
print('Without seed')
print(norm.rvs(100, size = 5))
print(norm.rvs(100, size = 5))

print('With the same seed')
np.random.seed(42) 
print(norm.rvs(100, size = 5))
np.random.seed(42) # reset the random seed back to 42
print(norm.rvs(100, size = 5))

print('Without seed')
np.random.seed(None)
print(norm.rvs(100, size = 5))
print(norm.rvs(100, size = 5))

Without seed
[100.17820269  99.99121479 102.03905608  99.47298494  99.94185588]
[101.3088409  100.41584692  99.14934789  99.11562462  99.93629751]
With the same seed
[100.49671415  99.8617357  100.64768854 101.52302986  99.76584663]
[100.49671415  99.8617357  100.64768854 101.52302986  99.76584663]
Without seed
[100.48004679  97.94668038  99.54537045 100.04552947  97.90894565]
[100.18949146 100.38309751  99.80358534 100.19989723 100.12354108]


In [23]:
# make the y variable of the target variable
y = df.is_TypeScript

y.head()

0     True
1    False
2     True
3    False
4     True
Name: is_TypeScript, dtype: bool

In [24]:
# seperate into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2, random_state=319)

In [25]:
# set up as dataframes
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

In [26]:
# establish baseline
train.actual.value_counts()

False    108
True     100
Name: actual, dtype: int64

In [27]:
train['baseline'] = False
# most actual values are false, so we will use that as baseline

In [28]:
train.head()

Unnamed: 0,actual,baseline
257,True,False
65,True,False
207,False,False
136,True,False
99,True,False


In [29]:
print(' Baseline Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.baseline)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.baseline, train.actual))
print('---')
print(classification_report(train.actual, train.baseline))

 Baseline Accuracy: 51.92%
---
Confusion Matrix
actual    False  True 
baseline              
False       108    100
---
              precision    recall  f1-score   support

       False       0.52      1.00      0.68       108
        True       0.00      0.00      0.00       100

    accuracy                           0.52       208
   macro avg       0.26      0.50      0.34       208
weighted avg       0.27      0.52      0.35       208



  _warn_prf(average, modifier, msg_start, len(result))


## Logistic Regression

In [30]:
# create object and fit to train
lm = LogisticRegression(random_state=319).fit(X_train, y_train)

In [31]:
# make predictions
train['log_predicted'] = lm.predict(X_train)
test['log_predicted'] = lm.predict(X_test)

In [32]:
train.head()

Unnamed: 0,actual,baseline,log_predicted
257,True,False,True
65,True,False,True
207,False,False,False
136,True,False,True
99,True,False,True


In [33]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.log_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.log_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.log_predicted))

Accuracy: 97.12%
---
Confusion Matrix
actual         False  True 
log_predicted              
False            105      3
True               3     97
---
              precision    recall  f1-score   support

       False       0.97      0.97      0.97       108
        True       0.97      0.97      0.97       100

    accuracy                           0.97       208
   macro avg       0.97      0.97      0.97       208
weighted avg       0.97      0.97      0.97       208



In [34]:
test.head()

Unnamed: 0,actual,log_predicted
107,False,False
49,False,False
66,False,False
62,False,False
10,False,False


In [35]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.log_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.log_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.log_predicted))

Accuracy: 69.81%
---
Confusion Matrix
actual         False  True 
log_predicted              
False             24     13
True               3     13
---
              precision    recall  f1-score   support

       False       0.65      0.89      0.75        27
        True       0.81      0.50      0.62        26

    accuracy                           0.70        53
   macro avg       0.73      0.69      0.68        53
weighted avg       0.73      0.70      0.69        53



- Logistic Regression did very well on train, however comparitively did very poorly on test

- Rather low precision on false

- More accurate than baseline, however

## Decision Tree

In [36]:
# create object and fit to train
dt = DecisionTreeClassifier(random_state=319).fit(X_train, y_train)

In [37]:
# make predictions
train['dt_predicted'] = dt.predict(X_train)
test['dt_predicted'] = dt.predict(X_test)

In [38]:
train.head()

Unnamed: 0,actual,baseline,log_predicted,dt_predicted
257,True,False,True,True
65,True,False,True,True
207,False,False,False,False
136,True,False,True,True
99,True,False,True,True


In [39]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.dt_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.dt_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.dt_predicted))

Accuracy: 99.52%
---
Confusion Matrix
actual        False  True 
dt_predicted              
False           108      1
True              0     99
---
              precision    recall  f1-score   support

       False       0.99      1.00      1.00       108
        True       1.00      0.99      0.99       100

    accuracy                           1.00       208
   macro avg       1.00      0.99      1.00       208
weighted avg       1.00      1.00      1.00       208



In [40]:
test.head()

Unnamed: 0,actual,log_predicted,dt_predicted
107,False,False,False
49,False,False,False
66,False,False,False
62,False,False,False
10,False,False,False


In [41]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.dt_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.dt_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.dt_predicted))

Accuracy: 66.04%
---
Confusion Matrix
actual        False  True 
dt_predicted              
False            19     10
True              8     16
---
              precision    recall  f1-score   support

       False       0.66      0.70      0.68        27
        True       0.67      0.62      0.64        26

    accuracy                           0.66        53
   macro avg       0.66      0.66      0.66        53
weighted avg       0.66      0.66      0.66        53



- This model did better on train than logistic regression but even worse on test

- low precision on both true and false

- Still more accurate than baseline

## Random Forest

In [42]:
# create object and fit to train
rf = RandomForestClassifier(random_state=319).fit(X_train, y_train)

In [43]:
# make predictions
train['rf_predicted'] = rf.predict(X_train)
test['rf_predicted'] = rf.predict(X_test)

In [44]:
train.head()

Unnamed: 0,actual,baseline,log_predicted,dt_predicted,rf_predicted
257,True,False,True,True,True
65,True,False,True,True,True
207,False,False,False,False,False
136,True,False,True,True,True
99,True,False,True,True,True


In [45]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.rf_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.rf_predicted))

Accuracy: 99.52%
---
Confusion Matrix
actual        False  True 
rf_predicted              
False           108      1
True              0     99
---
              precision    recall  f1-score   support

       False       0.99      1.00      1.00       108
        True       1.00      0.99      0.99       100

    accuracy                           1.00       208
   macro avg       1.00      0.99      1.00       208
weighted avg       1.00      1.00      1.00       208



In [46]:
test.head()

Unnamed: 0,actual,log_predicted,dt_predicted,rf_predicted
107,False,False,False,False
49,False,False,False,False
66,False,False,False,False
62,False,False,False,False
10,False,False,False,False


In [47]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.rf_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.rf_predicted))

Accuracy: 62.26%
---
Confusion Matrix
actual        False  True 
rf_predicted              
False            20     13
True              7     13
---
              precision    recall  f1-score   support

       False       0.61      0.74      0.67        27
        True       0.65      0.50      0.57        26

    accuracy                           0.62        53
   macro avg       0.63      0.62      0.62        53
weighted avg       0.63      0.62      0.62        53



- Once again this model did very well on train but poorly on test 

- poor precision on both true and false

- poor recall on true

- slightly more accurate than baseline

## K-Nearest Neighbor

In [48]:
# create object and fit to train
knn = KNeighborsClassifier().fit(X_train, y_train)

In [49]:
# make predictions
train['knn_predicted'] = knn.predict(X_train)
test['knn_predicted'] = knn.predict(X_test)

In [50]:
train.head()

Unnamed: 0,actual,baseline,log_predicted,dt_predicted,rf_predicted,knn_predicted
257,True,False,True,True,True,True
65,True,False,True,True,True,True
207,False,False,False,False,False,False
136,True,False,True,True,True,True
99,True,False,True,True,True,True


In [51]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.knn_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.knn_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.knn_predicted))

Accuracy: 78.37%
---
Confusion Matrix
actual         False  True 
knn_predicted              
False             89     26
True              19     74
---
              precision    recall  f1-score   support

       False       0.77      0.82      0.80       108
        True       0.80      0.74      0.77       100

    accuracy                           0.78       208
   macro avg       0.78      0.78      0.78       208
weighted avg       0.78      0.78      0.78       208



In [52]:
test.head()

Unnamed: 0,actual,log_predicted,dt_predicted,rf_predicted,knn_predicted
107,False,False,False,False,False
49,False,False,False,False,False
66,False,False,False,False,True
62,False,False,False,False,False
10,False,False,False,False,True


In [53]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.knn_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.knn_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.knn_predicted))

Accuracy: 69.81%
---
Confusion Matrix
actual         False  True 
knn_predicted              
False             21     10
True               6     16
---
              precision    recall  f1-score   support

       False       0.68      0.78      0.72        27
        True       0.73      0.62      0.67        26

    accuracy                           0.70        53
   macro avg       0.70      0.70      0.70        53
weighted avg       0.70      0.70      0.70        53



- This model didn't do as well on train, however did about just as well on test

- more accurate than basline

# Modeling on all languages

In [54]:
df.head()

Unnamed: 0,repo,language,readme_contents,is_TypeScript
0,microsoft/roosterjs-react,TypeScript,contributing project welcome contribution sugg...,True
1,microsoft/vscode-azure-iot-toolkit,HTML,azure iot hub join chat httpsgitterimmicrosoft...,False
2,microsoft/vscode-azuretools,TypeScript,vscode azure sdk nodejs build statushttpsdevaz...,True
3,microsoft/knack,Python,knack image httpsimgshieldsiopypivknacksvg tar...,False
4,microsoft/browsecloud,TypeScript,browsecloud public demo try browsecloud demons...,True


In [55]:
# find only the most common languages
top_5 = df.language.value_counts().head()

In [56]:
top_5

TypeScript    126
C#             32
JavaScript     18
Python         16
C++            16
Name: language, dtype: int64

In [57]:
top_5_languages = ['TypeScript', 'C#', 'JavaScript', 'C++', 'Python']

In [58]:
# make into a seperate dataframe
top_5_df = df[df['language'].isin(top_5_languages)]

In [59]:
top_5_df.head()

Unnamed: 0,repo,language,readme_contents,is_TypeScript
0,microsoft/roosterjs-react,TypeScript,contributing project welcome contribution sugg...,True
2,microsoft/vscode-azuretools,TypeScript,vscode azure sdk nodejs build statushttpsdevaz...,True
3,microsoft/knack,Python,knack image httpsimgshieldsiopypivknacksvg tar...,False
4,microsoft/browsecloud,TypeScript,browsecloud public demo try browsecloud demons...,True
5,microsoft/FluidFramework,TypeScript,fluid fluid framework typescript library build...,True


In [60]:
top_5_df.language.value_counts()

TypeScript    126
C#             32
JavaScript     18
Python         16
C++            16
Name: language, dtype: int64

In [61]:
from prepare import split

In [62]:
train, validate, test = split(top_5_df, 'language')
train.head()

Unnamed: 0,repo,language,readme_contents,is_TypeScript
33,microsoft/cookie.gulp,JavaScript,contributing project welcome contribution sugg...,False
39,microsoft/electionguard-ballot-box,TypeScript,microsoft defending democracy program election...,True
47,microsoft/onnxruntime,C++,p aligncenterimg width50 srcdocsimagesonnxrunt...,False
238,microsoft/fhir-server,C#,fhir server azure net core implementation fhir...,False
130,microsoft/openpaimarketplace,JavaScript,p aligncenter img srcdocsimagesmarketplacesvg ...,False


In [63]:
# Establish baseline
train.language.value_counts()

TypeScript    71
C#            17
JavaScript    10
Python         9
C++            9
Name: language, dtype: int64

In [64]:
train['baseline'] = 'TypeScript'
# 'TypeScript' will be used as baseline because it is the most common

In [65]:
train.head()

Unnamed: 0,repo,language,readme_contents,is_TypeScript,baseline
33,microsoft/cookie.gulp,JavaScript,contributing project welcome contribution sugg...,False,TypeScript
39,microsoft/electionguard-ballot-box,TypeScript,microsoft defending democracy program election...,True,TypeScript
47,microsoft/onnxruntime,C++,p aligncenterimg width50 srcdocsimagesonnxrunt...,False,TypeScript
238,microsoft/fhir-server,C#,fhir server azure net core implementation fhir...,False,TypeScript
130,microsoft/openpaimarketplace,JavaScript,p aligncenter img srcdocsimagesmarketplacesvg ...,False,TypeScript


In [66]:
languages = train.language.value_counts().index.tolist()

languages

['TypeScript', 'C#', 'JavaScript', 'Python', 'C++']

In [67]:
# Train Accuracy
(train.language == train.baseline).mean()

0.6120689655172413

In [68]:
for language in languages:
    language_repos = train[train.language == language]
    accuracy = (language_repos.language == language_repos.baseline).mean()
    print(f"Predicting {language} on baseline has {round(accuracy, 2)}")

Predicting TypeScript on baseline has 1.0
Predicting C# on baseline has 0.0
Predicting JavaScript on baseline has 0.0
Predicting Python on baseline has 0.0
Predicting C++ on baseline has 0.0


In [69]:
print(classification_report(train.language, train.baseline))

              precision    recall  f1-score   support

          C#       0.00      0.00      0.00        17
         C++       0.00      0.00      0.00         9
  JavaScript       0.00      0.00      0.00        10
      Python       0.00      0.00      0.00         9
  TypeScript       0.61      1.00      0.76        71

    accuracy                           0.61       116
   macro avg       0.12      0.20      0.15       116
weighted avg       0.37      0.61      0.46       116



  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
# Setup our X variables
X_train = train.readme_contents
X_validate = validate.readme_contents
X_test = test.readme_contents

In [71]:
X_train.head()

33     contributing project welcome contribution sugg...
39     microsoft defending democracy program election...
47     p aligncenterimg width50 srcdocsimagesonnxrunt...
238    fhir server azure net core implementation fhir...
130    p aligncenter img srcdocsimagesmarketplacesvg ...
Name: readme_contents, dtype: object

In [72]:
# Setup our y variables
y_train = train.language
y_validate = validate.language
y_test = test.language

In [73]:
y_train.head()

33     JavaScript
39     TypeScript
47            C++
238            C#
130    JavaScript
Name: language, dtype: object

In [74]:
# Create TF-IDF object
tfidf = TfidfVectorizer()

In [75]:
# Fit on the training data
tfidf.fit(X_train)

TfidfVectorizer()

In [76]:
# Use the object
X_train_vectorized = tfidf.transform(X_train)
X_validate_vectorized = tfidf.transform(X_validate)
X_test_vectorized = tfidf.transform(X_test)

## Logistic Regression

In [77]:
# make logistic regression object
lm = LogisticRegression(random_state = 319)

In [78]:
# fit object to train data
lm.fit(X_train_vectorized, y_train)

LogisticRegression(random_state=319)

In [79]:
# make into pd dataframes
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

In [80]:
train.head()

Unnamed: 0,actual
33,JavaScript
39,TypeScript
47,C++
238,C#
130,JavaScript


In [81]:
# make predictions, add to dataframe
train['log_predicted'] = lm.predict(X_train_vectorized)
validate['log_predicted'] = lm.predict(X_validate_vectorized)
test['log_predicted'] = lm.predict(X_test_vectorized)

In [82]:
train.head()

Unnamed: 0,actual,log_predicted
33,JavaScript,TypeScript
39,TypeScript,TypeScript
47,C++,TypeScript
238,C#,TypeScript
130,JavaScript,TypeScript


In [83]:
train.log_predicted.value_counts()

TypeScript    114
C#              2
Name: log_predicted, dtype: int64

In [84]:
# Train Accuracy
(train.actual == train.log_predicted).mean()

0.6293103448275862

In [85]:
for language in languages:
    language_repos = train[train.actual == language]
    accuracy = (language_repos.actual == language_repos.log_predicted).mean()
    print(f"Predicting {language} has {round(accuracy, 2)}")

Predicting TypeScript has 1.0
Predicting C# has 0.12
Predicting JavaScript has 0.0
Predicting Python has 0.0
Predicting C++ has 0.0


In [86]:
print(classification_report(train.actual, train.log_predicted))

              precision    recall  f1-score   support

          C#       1.00      0.12      0.21        17
         C++       0.00      0.00      0.00         9
  JavaScript       0.00      0.00      0.00        10
      Python       0.00      0.00      0.00         9
  TypeScript       0.62      1.00      0.77        71

    accuracy                           0.63       116
   macro avg       0.32      0.22      0.20       116
weighted avg       0.53      0.63      0.50       116



  _warn_prf(average, modifier, msg_start, len(result))


In [87]:
# Out of sample accuracy
(validate.actual == validate.log_predicted).mean()

0.6

In [88]:
for language in languages:
    language_repos = validate[validate.actual == language]
    accuracy = (language_repos.actual == language_repos.log_predicted).mean()
    print(f"Predicting {language} has {round(accuracy, 2)}")

Predicting TypeScript has 1.0
Predicting C# has 0.0
Predicting JavaScript has 0.0
Predicting Python has 0.0
Predicting C++ has 0.0


In [89]:
print(classification_report(validate.actual, validate.log_predicted))

              precision    recall  f1-score   support

          C#       0.00      0.00      0.00         8
         C++       0.00      0.00      0.00         4
  JavaScript       0.00      0.00      0.00         4
      Python       0.00      0.00      0.00         4
  TypeScript       0.60      1.00      0.75        30

    accuracy                           0.60        50
   macro avg       0.12      0.20      0.15        50
weighted avg       0.36      0.60      0.45        50



- Logistice Regression appeared to predict almost all repos in train were typescript, basically baseline

- not very good performance overall

- barely underperforms compared to baseline

## Decision Tree

In [90]:
# make decision tree object
dt = DecisionTreeClassifier(random_state = 319)

In [91]:
# fit object to train data
dt.fit(X_train_vectorized, y_train)

DecisionTreeClassifier(random_state=319)

In [92]:
# make predictions, add to dataframe
train['dt_predicted'] = dt.predict(X_train_vectorized)
validate['dt_predicted'] = dt.predict(X_validate_vectorized)
test['dt_predicted'] = dt.predict(X_test_vectorized)

In [93]:
train.head()

Unnamed: 0,actual,log_predicted,dt_predicted
33,JavaScript,TypeScript,JavaScript
39,TypeScript,TypeScript,TypeScript
47,C++,TypeScript,C++
238,C#,TypeScript,C#
130,JavaScript,TypeScript,JavaScript


In [94]:
train.dt_predicted.value_counts()

TypeScript    70
C#            16
JavaScript    12
Python         9
C++            9
Name: dt_predicted, dtype: int64

In [95]:
# Train Accuracy
(train.actual == train.dt_predicted).mean()

0.9827586206896551

In [96]:
for language in languages:
    language_repos = train[train.actual == language]
    accuracy = (language_repos.actual == language_repos.dt_predicted).mean()
    print(f"Predicting {language} has {round(accuracy, 2)}")

Predicting TypeScript has 0.99
Predicting C# has 0.94
Predicting JavaScript has 1.0
Predicting Python has 1.0
Predicting C++ has 1.0


In [97]:
print(classification_report(train.actual, train.dt_predicted))

              precision    recall  f1-score   support

          C#       1.00      0.94      0.97        17
         C++       1.00      1.00      1.00         9
  JavaScript       0.83      1.00      0.91        10
      Python       1.00      1.00      1.00         9
  TypeScript       1.00      0.99      0.99        71

    accuracy                           0.98       116
   macro avg       0.97      0.99      0.97       116
weighted avg       0.99      0.98      0.98       116



In [98]:
# Out of sample accuracy
(validate.actual == validate.dt_predicted).mean()

0.64

In [99]:
for language in languages:
    language_repos = validate[validate.actual == language]
    accuracy = (language_repos.actual == language_repos.dt_predicted).mean()
    print(f"Predicting {language} has {round(accuracy, 2)}")

Predicting TypeScript has 0.8
Predicting C# has 0.75
Predicting JavaScript has 0.0
Predicting Python has 0.25
Predicting C++ has 0.25


In [100]:
print(classification_report(validate.actual, validate.dt_predicted))

              precision    recall  f1-score   support

          C#       0.86      0.75      0.80         8
         C++       0.25      0.25      0.25         4
  JavaScript       0.00      0.00      0.00         4
      Python       0.11      0.25      0.15         4
  TypeScript       0.83      0.80      0.81        30

    accuracy                           0.64        50
   macro avg       0.41      0.41      0.40        50
weighted avg       0.66      0.64      0.65        50



- this model performed almost perfectly on train, and while it didn't do so well on validate, it still outperformed logistic regression

- slightly outperforms baseline

## Random Forest

In [101]:
# make random forest object
rf = RandomForestClassifier(random_state = 319)

In [102]:
# fit object to train data
rf.fit(X_train_vectorized, y_train)

RandomForestClassifier(random_state=319)

In [103]:
# make predictions, add to dataframe
train['rf_predicted'] = rf.predict(X_train_vectorized)
validate['rf_predicted'] = rf.predict(X_validate_vectorized)
test['rf_predicted'] = rf.predict(X_test_vectorized)

In [104]:
train.head()

Unnamed: 0,actual,log_predicted,dt_predicted,rf_predicted
33,JavaScript,TypeScript,JavaScript,JavaScript
39,TypeScript,TypeScript,TypeScript,TypeScript
47,C++,TypeScript,C++,C++
238,C#,TypeScript,C#,C#
130,JavaScript,TypeScript,JavaScript,JavaScript


In [105]:
train.rf_predicted.value_counts()

TypeScript    70
C#            16
JavaScript    12
Python         9
C++            9
Name: rf_predicted, dtype: int64

In [106]:
# Train Accuracy
(train.actual == train.rf_predicted).mean()

0.9827586206896551

In [107]:
for language in languages:
    language_repos = train[train.actual == language]
    accuracy = (language_repos.actual == language_repos.rf_predicted).mean()
    print(f"Predicting {language} has {round(accuracy, 2)}")

Predicting TypeScript has 0.99
Predicting C# has 0.94
Predicting JavaScript has 1.0
Predicting Python has 1.0
Predicting C++ has 1.0


In [108]:
print(classification_report(train.actual, train.rf_predicted))

              precision    recall  f1-score   support

          C#       1.00      0.94      0.97        17
         C++       1.00      1.00      1.00         9
  JavaScript       0.83      1.00      0.91        10
      Python       1.00      1.00      1.00         9
  TypeScript       1.00      0.99      0.99        71

    accuracy                           0.98       116
   macro avg       0.97      0.99      0.97       116
weighted avg       0.99      0.98      0.98       116



In [109]:
# Out of sample accuracy
(validate.actual == validate.rf_predicted).mean()

0.54

In [110]:
for language in languages:
    language_repos = validate[validate.actual == language]
    accuracy = (language_repos.actual == language_repos.rf_predicted).mean()
    print(f"Predicting {language} has {round(accuracy, 2)}")

Predicting TypeScript has 0.9
Predicting C# has 0.0
Predicting JavaScript has 0.0
Predicting Python has 0.0
Predicting C++ has 0.0


In [111]:
print(classification_report(validate.actual, validate.rf_predicted))

              precision    recall  f1-score   support

          C#       0.00      0.00      0.00         8
         C++       0.00      0.00      0.00         4
  JavaScript       0.00      0.00      0.00         4
      Python       0.00      0.00      0.00         4
  TypeScript       0.57      0.90      0.70        30

    accuracy                           0.54        50
   macro avg       0.11      0.18      0.14        50
weighted avg       0.34      0.54      0.42        50



  _warn_prf(average, modifier, msg_start, len(result))


In [112]:
validate.rf_predicted.value_counts()

TypeScript    47
C#             3
Name: rf_predicted, dtype: int64

- this model did very well on test, however on out of sample data it seemed to predict almomst everything was TypeScript which severly impacted its accuracy

- does not outperform baseline

## K-Nearest Neighbor

In [113]:
# make knn object
knn = KNeighborsClassifier()

In [114]:
# fit object to train data
knn.fit(X_train_vectorized, y_train)

KNeighborsClassifier()

In [115]:
# make predictions, add to dataframe
train['knn_predicted'] = knn.predict(X_train_vectorized)
validate['knn_predicted'] = knn.predict(X_validate_vectorized)
test['knn_predicted'] = knn.predict(X_test_vectorized)

In [116]:
train.head()

Unnamed: 0,actual,log_predicted,dt_predicted,rf_predicted,knn_predicted
33,JavaScript,TypeScript,JavaScript,JavaScript,C#
39,TypeScript,TypeScript,TypeScript,TypeScript,TypeScript
47,C++,TypeScript,C++,C++,C++
238,C#,TypeScript,C#,C#,C#
130,JavaScript,TypeScript,JavaScript,JavaScript,JavaScript


In [117]:
train.knn_predicted.value_counts()

TypeScript    69
C#            21
JavaScript    16
Python         6
C++            4
Name: knn_predicted, dtype: int64

In [118]:
# Train Accuracy
(train.actual == train.knn_predicted).mean()

0.7155172413793104

In [119]:
for language in languages:
    language_repos = train[train.actual == language]
    accuracy = (language_repos.actual == language_repos.knn_predicted).mean()
    print(f"Predicting {language} has {round(accuracy, 2)}")

Predicting TypeScript has 0.82
Predicting C# has 0.82
Predicting JavaScript has 0.3
Predicting Python has 0.56
Predicting C++ has 0.33


In [120]:
print(classification_report(train.actual, train.knn_predicted))

              precision    recall  f1-score   support

          C#       0.67      0.82      0.74        17
         C++       0.75      0.33      0.46         9
  JavaScript       0.19      0.30      0.23        10
      Python       0.83      0.56      0.67         9
  TypeScript       0.84      0.82      0.83        71

    accuracy                           0.72       116
   macro avg       0.66      0.57      0.58       116
weighted avg       0.75      0.72      0.72       116



In [121]:
# Out of sample accuracy
(validate.actual == validate.knn_predicted).mean()

0.44

In [122]:
for language in languages:
    language_repos = validate[validate.actual == language]
    accuracy = (language_repos.actual == language_repos.knn_predicted).mean()
    print(f"Predicting {language} has {round(accuracy, 2)}")

Predicting TypeScript has 0.6
Predicting C# has 0.12
Predicting JavaScript has 0.0
Predicting Python has 0.25
Predicting C++ has 0.5


In [123]:
print(classification_report(validate.actual, validate.knn_predicted))

              precision    recall  f1-score   support

          C#       0.25      0.12      0.17         8
         C++       0.67      0.50      0.57         4
  JavaScript       0.00      0.00      0.00         4
      Python       0.50      0.25      0.33         4
  TypeScript       0.62      0.60      0.61        30

    accuracy                           0.44        50
   macro avg       0.41      0.30      0.34        50
weighted avg       0.51      0.44      0.47        50



- this model performed poorly on both train and validate data compared to the other models

- very low accuracy on all languages

- severly underperforms compared to baseline

## Evaluate 

- The Decision Tree using default settings performed the best on validate data and was the only model to beat baseline, so I will use that model on the unseen test data

In [124]:
test.head()

Unnamed: 0,actual,log_predicted,dt_predicted,rf_predicted,knn_predicted
106,TypeScript,TypeScript,TypeScript,TypeScript,JavaScript
261,C#,TypeScript,JavaScript,JavaScript,C#
51,TypeScript,TypeScript,Python,TypeScript,TypeScript
16,TypeScript,TypeScript,C++,TypeScript,TypeScript
55,TypeScript,TypeScript,TypeScript,TypeScript,TypeScript


In [125]:
test.actual.value_counts()

TypeScript    25
C#             7
JavaScript     4
Python         3
C++            3
Name: actual, dtype: int64

In [126]:
# test accuracy
(test.actual == test.dt_predicted).mean()

0.5714285714285714

In [127]:
for language in languages:
    language_repos = test[test.actual == language]
    accuracy = (language_repos.actual == language_repos.dt_predicted).mean()
    print(f"Predicting {language} has {round(accuracy, 2)}")

Predicting TypeScript has 0.72
Predicting C# has 0.43
Predicting JavaScript has 0.0
Predicting Python has 0.67
Predicting C++ has 0.33


In [128]:
print(classification_report(test.actual, test.dt_predicted))

              precision    recall  f1-score   support

          C#       0.75      0.43      0.55         7
         C++       0.17      0.33      0.22         3
  JavaScript       0.00      0.00      0.00         4
      Python       0.40      0.67      0.50         3
  TypeScript       0.72      0.72      0.72        25

    accuracy                           0.57        42
   macro avg       0.41      0.43      0.40        42
weighted avg       0.59      0.57      0.57        42



- performed very poorly on unseen test data, did not beat baseline

- may need to try to pull more repositories for future iterations