In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup as bs
import json
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

import acquire as ac
import prepare as pr
import wrangle as wr

# Acquiring Data:

In [2]:
# Extra words handed to the wrangle step via `extra_words`
# (presumably treated as additional stopwords — confirm in wrangle.cat_wrangle).
shared_word_list = ['cat', 'use', 'image', 'using', 'file', 'run']

# NOTE(review): the saved output of this cell ends with
# "NameError: name 'df_analysis' is not defined" — confirm the helper runs on a fresh kernel.
(X_train, y_train,
 X_validate, y_validate,
 X_test, y_test,
 df, df_languages) = wr.cat_wrangle(extra_words=shared_word_list)

Removed 60 rows with empty Readmes.
Removed 197 rows with Readmes < 10 words long.


NameError: name 'df_analysis' is not defined

In [None]:
# Split the wrangled frame into train / validate / test subsets with the project helper.
# NOTE(review): split proportions and seeding are decided inside wrangle.splitter — confirm there.
train, validate, test = wr.splitter(df)

In [None]:
# Preview the first rows of the training subset.
train.head()

# Preparing Data:

## Cleaning the original DataFrame: 
- Creates columns for the repo name, the top language, the contents of the README, and the cleaned text of the README:
    - Removes newlines, urls, and words that are longer than 14 characters.
    - Makes it all lowercase, tokenizes the words, and then lemmatizes (or stems) them.

#### Removing any records with fewer than 11 words (leaves 743 records):

# Exploration
- My focus is on bigrams and word visualizations.

In [None]:
# Number of training READMEs in each language group.
train['language_group'].value_counts()

## Creating WordClouds:

### Python:

#### Creating a separate DataFrame of Python-led READMEs:

In [None]:
# Keep only the training rows whose language group is Python.
python_df = train.loc[train['language_group'] == 'Python']

In [None]:
# Show the first two Python rows.
python_df.iloc[:2]

#### Joining all the text from Python READMEs into a single group:

In [None]:
# Combine every cleaned Python README into one string.
# A space separator is required: the previous `+` concatenation inserted nothing
# between READMEs, so the last word of one fused with the first word of the next
# and polluted the word and bigram counts. str.join also avoids rebuilding the
# string on every iteration.
python_text = ' '.join(python_df.cleaned)

# Preview the start of the combined text.
python_text[:200]

#### Creating a WordCloud from all of the Python text:

In [None]:
# Build a word cloud from the combined Python README text.
img = WordCloud(background_color='white').generate(python_text)
# WordCloud() produces an image object, which can be displayed with plt.imshow.
plt.imshow(img)
# Title the figure so it can be told apart from the other language groups' clouds.
plt.title('Python README word cloud')
# Axes are not useful for a word cloud.
plt.axis('off')
# plt.show() renders the figure without echoing the return value of plt.axis.
plt.show()

#### Creating bigrams from the text:

In [None]:
# nltk.ngrams returns a one-shot generator; materialise it once so `bigrams`
# remains usable after this cell instead of being left exhausted.
bigrams = list(nltk.ngrams(python_text.split(), 2))

# Preview only the first few pairs rather than printing every bigram in the corpus.
bigrams[:10]

In [None]:
# Count each adjacent word pair in the Python text and keep the 20 most frequent.
top_20_python_bigrams = (
    pd.Series(list(nltk.ngrams(python_text.split(), 2)))
    .value_counts()
    .iloc[:20]
)

# Show the five most common pairs.
top_20_python_bigrams.head()

In [None]:
# Turn each (word, word) tuple key into a single "word word" label mapped to its count.
data = {' '.join(k): v for k, v in top_20_python_bigrams.to_dict().items()}
img = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(data)
plt.figure(figsize=(8, 4))
plt.imshow(img)
# Title the figure so it can be told apart from the other language groups' clouds.
plt.title('Top 20 Python README bigrams')
plt.axis('off')
plt.show()

Given that these bigrams seem to be mostly nonsensical single letters, it may be worth adding a condition to the cleaning function that eliminates single-letter words (although it's worth noting that this would also eliminate legitimate words like "I" or "a").

### Scala:

In [None]:
# Keep only the training rows whose language group is Scala.
scala_df = train.loc[train['language_group'] == 'Scala']

In [None]:
# Combine every cleaned Scala README into one string.
# A space separator is required: the previous `+` concatenation inserted nothing
# between READMEs, so the last word of one fused with the first word of the next
# and polluted the word and bigram counts. str.join also avoids rebuilding the
# string on every iteration.
scala_text = ' '.join(scala_df.cleaned)

# Preview the start of the combined text.
scala_text[:200]

In [None]:
# Build a word cloud from the combined Scala README text.
img = WordCloud(background_color='white').generate(scala_text)
# WordCloud() produces an image object, which can be displayed with plt.imshow.
plt.imshow(img)
# Title the figure so it can be told apart from the other language groups' clouds.
plt.title('Scala README word cloud')
# Axes are not useful for a word cloud.
plt.axis('off')
plt.show()

In [None]:
# nltk.ngrams returns a one-shot generator; materialise it once so `scala_bigrams`
# remains usable after this cell instead of being left exhausted.
scala_bigrams = list(nltk.ngrams(scala_text.split(), 2))

# Preview only the first few pairs rather than printing every bigram in the corpus.
scala_bigrams[:10]

In [None]:
# Count each adjacent word pair in the Scala text and keep the 20 most frequent.
top_20_scala_bigrams = (
    pd.Series(list(nltk.ngrams(scala_text.split(), 2)))
    .value_counts()
    .iloc[:20]
)

# Show the five most common pairs.
top_20_scala_bigrams.head()

In [None]:
# Turn each (word, word) tuple key into a single "word word" label mapped to its count.
data = {' '.join(k): v for k, v in top_20_scala_bigrams.to_dict().items()}
img = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(data)
plt.figure(figsize=(8, 4))
plt.imshow(img)
# Title the figure so it can be told apart from the other language groups' clouds.
plt.title('Top 20 Scala README bigrams')
plt.axis('off')
plt.show()

### JavaScript:

In [None]:
# Keep only the training rows whose language group is JavaScript.
js_df = train.loc[train['language_group'] == 'JavaScript']

In [None]:
# Combine every cleaned JavaScript README into one string.
# A space separator is required: the previous `+` concatenation inserted nothing
# between READMEs, so the last word of one fused with the first word of the next
# and polluted the word and bigram counts. str.join also avoids rebuilding the
# string on every iteration.
js_text = ' '.join(js_df.cleaned)

# Preview the start of the combined text.
js_text[:200]

In [None]:
# Build a word cloud from the combined JavaScript README text.
img = WordCloud(background_color='white').generate(js_text)
# WordCloud() produces an image object, which can be displayed with plt.imshow.
plt.imshow(img)
# Title the figure so it can be told apart from the other language groups' clouds.
plt.title('JavaScript README word cloud')
# Axes are not useful for a word cloud.
plt.axis('off')
plt.show()

In [None]:
# nltk.ngrams returns a one-shot generator; materialise it once so `js_bigrams`
# remains usable after this cell instead of being left exhausted.
js_bigrams = list(nltk.ngrams(js_text.split(), 2))

# Preview only the first few pairs rather than printing every bigram in the corpus.
js_bigrams[:10]

In [None]:
# Count each adjacent word pair in the JavaScript text and keep the 20 most frequent.
top_20_js_bigrams = (
    pd.Series(list(nltk.ngrams(js_text.split(), 2)))
    .value_counts()
    .iloc[:20]
)

# Show the five most common pairs.
top_20_js_bigrams.head()

In [None]:
# Turn each (word, word) tuple key into a single "word word" label mapped to its count.
data = {' '.join(k): v for k, v in top_20_js_bigrams.to_dict().items()}
img = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(data)
plt.figure(figsize=(8, 4))
plt.imshow(img)
# Title the figure so it can be told apart from the other language groups' clouds.
plt.title('Top 20 JavaScript README bigrams')
plt.axis('off')
plt.show()

### Other:

In [None]:
# Keep only the training rows whose language group is Other.
other_df = train.loc[train['language_group'] == 'Other']

In [None]:
# Combine every cleaned "Other" README into one string.
# A space separator is required: the previous `+` concatenation inserted nothing
# between READMEs, so the last word of one fused with the first word of the next
# and polluted the word and bigram counts. str.join also avoids rebuilding the
# string on every iteration.
other_text = ' '.join(other_df.cleaned)

# Preview the start of the combined text.
other_text[:200]

In [None]:
# Build a word cloud from the combined "Other" README text.
img = WordCloud(background_color='white').generate(other_text)
# WordCloud() produces an image object, which can be displayed with plt.imshow.
plt.imshow(img)
# Title the figure so it can be told apart from the other language groups' clouds.
plt.title('Other-language README word cloud')
# Axes are not useful for a word cloud.
plt.axis('off')
plt.show()

In [None]:
# nltk.ngrams returns a one-shot generator; materialise it once so `other_bigrams`
# remains usable after this cell instead of being left exhausted.
other_bigrams = list(nltk.ngrams(other_text.split(), 2))

# Preview only the first few pairs rather than printing every bigram in the corpus.
other_bigrams[:10]

In [None]:
# Count each adjacent word pair in the "Other" text and keep the 20 most frequent.
top_20_other_bigrams = (
    pd.Series(list(nltk.ngrams(other_text.split(), 2)))
    .value_counts()
    .iloc[:20]
)

# Show the five most common pairs.
top_20_other_bigrams.head()

In [None]:
# Turn each (word, word) tuple key into a single "word word" label mapped to its count.
data = {' '.join(k): v for k, v in top_20_other_bigrams.to_dict().items()}
img = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(data)
plt.figure(figsize=(8, 4))
plt.imshow(img)
# Title the figure so it can be told apart from the other language groups' clouds.
plt.title('Top 20 Other-language README bigrams')
plt.axis('off')
plt.show()

# Prepping for modeling:

## Creating three models to pick the best:

In [None]:
# Preview the full wrangled frame that the modeling cells below split and vectorize.
df.head()

### Logistic Regression:

In [None]:
# Target: the language group of each README.
y = df.language_group

# Split the raw text BEFORE vectorizing so validate/test documents never influence
# the TF-IDF vocabulary or IDF weights. random_state makes the split reproducible
# (123 is the same seed value this notebook uses for its DecisionTreeClassifier),
# so every model cell that uses this seed is scored on the same rows.
text_train_validate, text_test, y_train_validate, y_test = train_test_split(
    df.cleaned, y, test_size=.2, stratify=y, random_state=123)
text_train, text_validate, y_train, y_validate = train_test_split(
    text_train_validate, y_train_validate, test_size=.2,
    stratify=y_train_validate, random_state=123)

# `tfidf` was not defined by any earlier cell, so it is created here.
# Default settings are assumed — adjust if the original vectorizer used custom options.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(text_train)   # learn vocabulary/IDF from train only
X_validate = tfidf.transform(text_validate)
X_test = tfidf.transform(text_test)

# NOTE: this rebinds `train` / `validate` (previously the frames from wr.splitter)
# to small actual-vs-predicted frames; re-run the splitter cell before re-running
# any exploration cell.
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
validate['predicted'] = lm.predict(X_validate)

In [None]:
# Training-split results: accuracy, confusion matrix (rows = predicted, columns = actual),
# and the per-class classification report.
print('Accuracy: {:.2%}'.format(accuracy_score(train['actual'], train['predicted'])))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train['predicted'], train['actual']))
print('---')
print(classification_report(train['actual'], train['predicted']))

In [None]:
# Validate-split results: accuracy, confusion matrix (rows = predicted, columns = actual),
# and the per-class classification report.
print('Accuracy: {:.2%}'.format(accuracy_score(validate['actual'], validate['predicted'])))
print('---')
print('Confusion Matrix')
print(pd.crosstab(validate['predicted'], validate['actual']))
print('---')
print(classification_report(validate['actual'], validate['predicted']))

### <span style = 'color:green'>KNN: </span>

In [None]:
# Class counts in the current validate target (whichever split was run most recently).
y_validate.value_counts()

In [None]:
# Target: the language group of each README.
y = df.language_group

# Split the raw text BEFORE vectorizing so validate/test documents never influence
# the TF-IDF vocabulary or IDF weights. random_state makes the split reproducible
# (123 is the same seed value this notebook uses for its DecisionTreeClassifier),
# so every model cell that uses this seed is scored on the same rows.
text_train_validate, text_test, y_train_validate, y_test = train_test_split(
    df.cleaned, y, test_size=.2, stratify=y, random_state=123)
text_train, text_validate, y_train, y_validate = train_test_split(
    text_train_validate, y_train_validate, test_size=.2,
    stratify=y_train_validate, random_state=123)

# Create the vectorizer here so the cell does not depend on a `tfidf` left over
# from other cells. Default settings are assumed — adjust if custom options are needed.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(text_train)   # learn vocabulary/IDF from train only
X_validate = tfidf.transform(text_validate)
X_test = tfidf.transform(text_test)

# NOTE: this rebinds `train` / `validate` to actual-vs-predicted frames,
# discarding whatever those names held before this cell ran.
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))

knn = KNeighborsClassifier(n_neighbors = 19, weights='uniform')
knn.fit(X_train, y_train)

train['knn_predicted'] = knn.predict(X_train)
validate['knn_predicted'] = knn.predict(X_validate)

print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

In [None]:
# Score the fitted KNN model on the validate split.
# The label previously read "training set" even though X_validate / y_validate are scored.
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

In [None]:
# Report the fitted KNN model's accuracy on the held-out test split, to two decimals.
knn_test_template = 'Accuracy of KNN classifier on test set: {:.2f}'
print(knn_test_template.format(knn.score(X_test, y_test)))

### Decision Tree:

In [None]:
# Target: the language group of each README.
y = df.language_group

# Split the raw text BEFORE vectorizing so validate/test documents never influence
# the TF-IDF vocabulary or IDF weights. random_state makes the split reproducible
# (the same seed, 123, is used for the tree itself below), so every model cell
# that uses this seed is scored on the same rows.
text_train_validate, text_test, y_train_validate, y_test = train_test_split(
    df.cleaned, y, test_size=.2, stratify=y, random_state=123)
text_train, text_validate, y_train, y_validate = train_test_split(
    text_train_validate, y_train_validate, test_size=.2,
    stratify=y_train_validate, random_state=123)

# Create the vectorizer here so the cell does not depend on a `tfidf` left over
# from other cells. Default settings are assumed — adjust if custom options are needed.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(text_train)   # learn vocabulary/IDF from train only
X_validate = tfidf.transform(text_validate)
X_test = tfidf.transform(text_test)

# NOTE: this rebinds `train` / `validate` to actual-vs-predicted frames,
# discarding whatever those names held before this cell ran.
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))

clf = DecisionTreeClassifier(max_depth=17, random_state=123)
clf = clf.fit(X_train, y_train)

train['clf_predicted'] = clf.predict(X_train)
validate['clf_predicted'] = clf.predict(X_validate)

In [None]:
# Per-class report for the decision tree on the training split.
# (An earlier run was noted as roughly 88 percent accurate on the train set.)
print(classification_report(y_train, train['clf_predicted']))

In [None]:
# Per-class report for the decision tree on the validate split.
print(classification_report(y_validate, validate['clf_predicted']))