In [1]:
# import libaries
import pandas as pd
import numpy as np

# import visualization tools
import matplotlib.pyplot as plt
import seaborn as sns

# import machine learning libaries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import classification_report
from sklearn import tree

# import NLP tool
from wordcloud import WordCloud

# import self-created functions
import prepare
import project_acquire
import explore

# ignore warning
import warnings
warnings.filterwarnings("ignore")

# Wrangle

In [2]:
# use prepare module wrangle data function to acquire data
data=pd.read_json('data.json')

In [3]:
# check if we capture any duplicated repo/readme
data.duplicated().value_counts()

False    783
True     217
dtype: int64

### Takeaway
- The readme acquired contains foreign language, we will going to drop those languages that is not in English, due to we acquired about 1000 rows, we met the requirement of at least 100 rows.

#### Install the package needed to detect the language
- \# pip install langdetect

### Key takeaway
- The most starred README data on github was collected on October 18, 2022 due to error code found, the previous dataset was deleted and re-collected on October 18th.
- The data has 1000 rows
- We found that the readme contains foreign language, therefore, we downloaded a langdetect package for further wrangle the dataset
- we also spot some 'none' value in our language, we will do further wrangle with that data as well
- Following this acquire, we are going to prepare for our exploration

In [4]:
# clean the data acquired
# We are acquire the data that is cleaned up with tokenized, stemmed, and lemmatized
# add those columns into the dataframe and create a final data frame
df=prepare.wrangle_data(data)

In [5]:
# we will going to drop 'none' value in our language column
df=df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 911 entries, 2 to 999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             911 non-null    object
 1   language         911 non-null    object
 2   readme_contents  911 non-null    object
 3   stemmed          911 non-null    object
 4   lemmatized       911 non-null    object
 5   clean_tokens     911 non-null    object
dtypes: object(6)
memory usage: 49.8+ KB


In [6]:
# we will going to use the installed package 
# to filter out the readme contents that is in English only

# we are going to import a new libary for this
import langdetect as ld

# we created a new function to detect the non-english language in read me
# the function will return the result when it is not in english,, elso will not return the result
def is_en(txt):
    try:
        return ld.detect(txt)!='en'
    except:
        return False

# we applied the function we created 
nodf = df[df['readme_contents'].apply(is_en)]

In [7]:
# drop those that is not in english
df=df.drop(index=(nodf.index))
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 851 entries, 2 to 999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             851 non-null    object
 1   language         851 non-null    object
 2   readme_contents  851 non-null    object
 3   stemmed          851 non-null    object
 4   lemmatized       851 non-null    object
 5   clean_tokens     851 non-null    object
dtypes: object(6)
memory usage: 46.5+ KB


- our dataframe should contains numerical data for better exploration, therefore, we are going to created some columns that able to represent the overall struture of the dataframe. we picked df.lemmatized

In [8]:
# we use the explore module to create those numerical columns other neccessary info for exploration
df=explore.feature_engineering(df)

In [9]:
# create a final local file for easy access
df.to_csv('data.csv')

In [10]:
# checking the describe
df.describe()

Unnamed: 0,word_count_simple,word_count,unique_count,non_single_count,percent_unique,percent_repeat,percent_one_word,percent_non_single
count,851.0,851.0,851.0,851.0,851.0,851.0,851.0,851.0
mean,747.06228,726.223267,369.408931,657.478261,0.6594,0.3406,0.774203,0.909256
std,1423.812626,1392.92186,467.212157,1254.591316,0.143103,0.143103,0.085346,0.092746
min,2.0,2.0,2.0,2.0,0.231533,0.0,0.458537,0.191489
25%,185.5,182.5,130.0,167.5,0.567879,0.244433,0.716157,0.850684
50%,372.0,363.0,233.0,334.0,0.652361,0.347639,0.768041,0.908257
75%,793.5,763.5,439.0,701.0,0.755567,0.432121,0.828009,1.0
max,19158.0,18994.0,4717.0,17104.0,1.0,0.768467,1.0,1.0


### Key takeaway so far
- The numerical columns created for better exploration
- The foreign language readme columns dropped to support our exploration
- we ended up have 669 columns remained
- The describe showed that there is significant jump in those counts, we believe we need to handle the outlier in our next step

In [11]:
# create a function to trim the outlier and drop null values again
def remove_outlier(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    new_df = ~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR)))
    return new_df

In [12]:
# check the data after trim the outlier
new_df=df[remove_outlier(df)]
new_df=new_df.dropna()
new_df

Unnamed: 0,repo,language,readme_contents,stemmed,lemmatized,clean_tokens,word_list,unique_words,non_single_words,word_count_simple,word_count,unique_count,non_single_count,percent_unique,percent_repeat,percent_one_word,percent_non_single
2,kamranahmedse/developer-roadmap,TypeScript,"<p align=""center"">\n <img src=""public/brand.p...",roadmapshcommun driven roadmap articl resourc ...,roadmapshcommunity driven roadmaps article res...,roadmapshcommunity driven roadmaps articles re...,"[roadmapshcommunity, driven, roadmaps, article...","[roadmapshcommunity, driven, roadmaps, article...","[driven, roadmaps, article, resource, interact...",117.0,117.0,83.0,109.0,0.709402,0.290598,0.855422,0.931624
3,twbs/bootstrap,JavaScript,"<p align=""center"">\n <a href=""https://getboot...",bootstrapsleek intuit power frontend framework...,bootstrapsleek intuitive powerful frontend fra...,bootstrapsleek intuitive powerful frontend fra...,"[bootstrapsleek, intuitive, powerful, frontend...","[bootstrapsleek, intuitive, powerful, frontend...","[intuitive, powerful, frontend, framework, fas...",522.0,515.0,353.0,418.0,0.685437,0.314563,0.790368,0.811650
8,golang/go,Go,# The Go Programming Language\n\nGo is an open...,go program languagego open sourc program langu...,go programming languagego open source programm...,go programming languagego open source programm...,"[go, programming, languagego, open, source, pr...","[go, programming, languagego, open, source, la...","[go, programming, open, source, programming, l...",95.0,95.0,78.0,81.0,0.821053,0.178947,0.846154,0.852632
9,30-seconds/30-seconds-of-code,JavaScript,[![Logo](/logo.png)](https://30secondsofcode.o...,logologopng 30 second code short javascript co...,logologopng 30 second code short javascript co...,logologopng 30 seconds code short javascript c...,"[logologopng, 30, second, code, short, javascr...","[logologopng, 30, second, code, short, javascr...","[logologopng, 30, second, code, short, javascr...",144.0,143.0,93.0,143.0,0.650350,0.349650,0.720430,1.000000
11,microsoft/terminal,C++,![terminal-logos](https://user-images.githubus...,terminallogo welcom window termin consol comma...,terminallogos welcome window terminal console ...,terminallogos welcome windows terminal console...,"[terminallogos, welcome, window, terminal, con...","[terminallogos, welcome, window, terminal, con...","[welcome, window, terminal, console, commandli...",995.0,978.0,570.0,828.0,0.582822,0.417178,0.770175,0.846626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,nikic/PHP-Parser,PHP,PHP Parser\n==========\n\n[![Coverage Status](...,php parsercoverag statusthi php parser written...,php parsercoverage statusthis php parser writt...,php parsercoverage statusthis php parser writt...,"[php, parsercoverage, statusthis, php, parser,...","[php, parsercoverage, statusthis, parser, writ...","[php, parsercoverage, statusthis, php, parser,...",345.0,344.0,221.0,344.0,0.642442,0.357558,0.769231,1.000000
995,catchorg/Catch2,C++,"<a id=""top""></a>\n![Catch2 logo](data/artwork/...",catch2 logodataartworkcatch2logosmallpnggithub...,catch2 logodataartworkcatch2logosmallpnggithub...,catch2 logodataartworkcatch2logosmallpnggithub...,"[catch2, logodataartworkcatch2logosmallpnggith...","[catch2, logodataartworkcatch2logosmallpnggith...","[catch2, logodataartworkcatch2logosmallpnggith...",184.0,181.0,147.0,181.0,0.812155,0.187845,0.843537,1.000000
996,xinntao/Real-ESRGAN,Python,"<p align=""center"">\n <img src=""assets/realesr...",english demosdemosvideo updatesupd usagequicki...,english demosdemosvideos updatesupdates usageq...,english demosdemosvideos updatesupdates usageq...,"[english, demosdemosvideos, updatesupdates, us...","[english, demosdemosvideos, updatesupdates, us...","[english, model, animevideov3, model, please, ...",742.0,742.0,432.0,607.0,0.582210,0.417790,0.710648,0.818059
998,go-redis/redis,Go,# Redis client for Go\n\n[![build workflow](ht...,redi client gobuild workflowpkggodevdocumentat...,redis client gobuild workflowpkggodevdocumenta...,redis client gobuild workflowpkggodevdocumenta...,"[redis, client, gobuild, workflowpkggodevdocum...","[redis, client, gobuild, workflowpkggodevdocum...","[redis, client, gobuild, workflowpkggodevdocum...",326.0,326.0,232.0,326.0,0.711656,0.288344,0.806034,1.000000


# Overall Wrangle Takeaway
- The data aquired on October 18th, 2022 with 1000 rows
- The data contains 'none' value in the language column, we dropped all 'none' values 
- The data contains foreign language in the readme contents column, we drop all those non-english values
- We create some numerical columns for further exploration
- we drop those outliers 
- We finalized with 586 rows, and ready for exploration

# Exploration

- What are the most common words in READMEs?
- Does the length of the README vary by programming language?
- Do different programming languages use a different number of unique words?
- Are there any words that uniquely identify a programming language?

In [13]:
# set up the environment to answer the question 4
new_df.language.value_counts()

JavaScript          175
TypeScript           92
Python               77
Go                   63
C++                  59
Java                 54
HTML                 23
Rust                 22
C                    22
C#                   21
PHP                  20
Swift                11
CSS                  11
Vue                  10
Jupyter Notebook     10
Shell                10
Objective-C           9
Kotlin                8
Scala                 4
Ruby                  4
Elixir                4
CoffeeScript          3
Clojure               3
Lua                   3
Markdown              3
Objective-C++         3
Emacs Lisp            2
Batchfile             2
OCaml                 2
Vim script            2
PowerShell            2
Groovy                2
Vim Script            2
Matlab                1
SCSS                  1
V                     1
Blade                 1
Perl                  1
Vala                  1
QML                   1
Zig                   1
Haskell         

In [14]:
cv = CountVectorizer()
X = cv.fit_transform(df.lemmatized)
y = df.language

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=13)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = .25, random_state=13)

tree = DecisionTreeClassifier(max_depth=2, random_state=123)
tree.fit(X_train, y_train)

print(f'Accuracy Score: {tree.score(X_val, y_val) * 100:.2f}%')

Accuracy Score: 28.24%


In [15]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.lemmatized)
y = df.language

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=13)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = .25, random_state=13)

tree = DecisionTreeClassifier(max_depth=17, random_state=13)
tree.fit(X_train, y_train)

print(f'Accuracy Score: {tree.score(X_val, y_val) * 100:.2f}%')

Accuracy Score: 51.18%


In [16]:
cv = CountVectorizer(ngram_range=(2, 2))
X = cv.fit_transform(df.lemmatized)
y = df.language

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=13)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = .25, random_state=13)

tree = DecisionTreeClassifier(max_depth=26, random_state=13)
tree.fit(X_train, y_train)

print(f'Accuracy Score: {tree.score(X_val, y_val) * 100:.2f}%')

Accuracy Score: 35.88%


In [17]:
cv = CountVectorizer(ngram_range=(3, 3))
X = cv.fit_transform(df.lemmatized)
y = df.language

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=13)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = .25, random_state=13)

tree = DecisionTreeClassifier(max_depth=25, random_state=13)
tree.fit(X_train, y_train)

print(f'Accuracy Score: {tree.score(X_val, y_val) * 100:.2f}%')

Accuracy Score: 28.24%
