# PLAN

- [ ] Acquisition
    - [ ] Select what list of repos to scrape.
    - [ ] Get requests from the site.
    - [ ] Save responses to csv.
- [ ] Preparation
    - [ ] Prepare the data for analysis.
- [ ] Exploration
    - [ ] Answer the following prompts:
        - [ ] What are the most common words in READMEs?
        - [ ] What does the distribution of IDFs look like for the most common words?
        - [ ] Does the length of the README vary by language?
        - [ ] Do different languages use a different number of unique words?
- [ ] Modeling
    - [ ] Transform the data for machine learning; use language to predict.
    - [ ] Fit several models using different text representations.
    - [ ] Build a function that takes in the text of a README file and predicts its language.
- [ ] Delivery
    - [ ] Github repo
        - [x] This notebook.
        - [ ] Documentation within the notebook.
        - [ ] README file in the repo.
        - [ ] Python scripts if applicable.
    - [ ] Google Slides
        - [ ] 1-2 slides only summarizing analysis.
        - [ ] Visualizations are labeled.
        - [ ] Geared for the general audience.
        - [ ] Share link @ readme file and/or classroom.

# ENVIRONMENT

In [1]:
# disable warnings
import warnings
warnings.filterwarnings("ignore")

import unicodedata
import re
from requests import get
import json
# import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv
from functools import reduce

# First page (p=1) of GitHub's repository search for repos with more than
# zero stars, sorted by star count.
BASEURL = 'https://github.com/search?p=1&q=stars%3A%3E0&s=stars&type=Repositories'
# Custom User-Agent header passed along with the search-page requests.
HEADERS = {'User-Agent': 'Definitely not Sentient Attack Helicoptor'}

# ACQUIRE

The first step is to collect the links to the most-starred GitHub repositories.

In [2]:
def get_url_list(page):
    """Collect repository URLs from GitHub's most-starred search results.

    Walks search-result pages 1 through ``page`` (inclusive), keeps every
    repository whose list item carries a ``programmingLanguage`` element,
    writes the absolute URLs as one row to ``github_urls.csv`` and returns
    them.

    Parameters
    ----------
    page : int
        Number of result pages to traverse. Values below 1 fetch nothing.

    Returns
    -------
    list of str
        Absolute ``https://github.com/...`` repository URLs.
    """
    urls = []
    for i in range(1, page + 1):
        url = 'https://github.com/search?p=' + str(i) + '&q=stars%3A%3E0&s=stars&type=Repositories'
        print(f'traversing url: {url}')
        response = get(url, headers=HEADERS)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        list_of_repos = soup.find('ul', class_='repo-list')
        if list_of_repos is None:
            # The page did not contain the expected list; skip it instead of
            # failing with AttributeError on None.
            print(f'no repository list found at: {url}')
        else:
            for item in list_of_repos.find_all('li', class_='repo-list-item'):
                if item.find(attrs={'itemprop': 'programmingLanguage'}):
                    link = item.find('a', href=True)
                    if link is not None:
                        urls.append(link.attrs['href'])
        # Pause between page requests.
        time.sleep(3)
    print(f'Scraped a total of {len(urls)} github urls.')
    urls = ['https://github.com' + url for url in urls]
    # newline='' lets the csv module control line endings itself.
    with open('github_urls.csv', 'w', newline='') as f:
        ghub_urls = csv.writer(f, delimiter=',')
        ghub_urls.writerow(urls)
    return urls


##### Function that grabs the readme text and the main language of the repo


In [3]:
def grab_readmes_and_languages(urls):
    """Download each repository page and extract its README text and language.

    A repository is skipped when its page has no ``div.Box-body`` element
    (used here as the README container) or no ``span.lang`` element.

    Parameters
    ----------
    urls : iterable of str
        Repository page URLs.

    Returns
    -------
    pandas.DataFrame
        One row per kept repository with columns ``readme`` and ``language``.
    """
    readmes = []
    languages = []
    for url in urls:
        # URLs read back from a CSV row can carry a trailing line terminator;
        # strip it so the request targets the real page.
        response = get(url.strip(), headers=HEADERS)
        soup = BeautifulSoup(response.content, 'html.parser')
        readme_box = soup.find('div', class_='Box-body')
        language_tag = soup.find('span', class_='lang')
        # Keep only repositories where both pieces were found, so the two
        # lists stay aligned.
        if readme_box is None or language_tag is None:
            continue
        readmes.append(readme_box.text)
        languages.append(language_tag.text)
    return pd.DataFrame({'readme': readmes, 'language': languages})


##### Cleaning functions.

In [4]:
def basic_clean(string):
    """Lowercase the text, fold it to ASCII and keep only the characters
    a-z, 0-9 and whitespace. Leading/trailing whitespace is trimmed and every
    inner run of whitespace becomes a single space."""
    lowered = string.lower()
    # NFKD splits accented characters so the ASCII encode can drop the accents.
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')
    kept = re.sub(r'[^a-z0-9\s]', '', ascii_text)
    return re.sub(r'\s+', ' ', kept.strip())



def tokenize(string, string_or_list='string'):
    """Tokenize with nltk's ToktokTokenizer.

    Returns the tokenizer's string output when string_or_list is 'string',
    its token list when it is 'list', and None for any other value."""
    toktok = nltk.tokenize.ToktokTokenizer()
    if string_or_list == 'list':
        return toktok.tokenize(string)
    if string_or_list == 'string':
        return toktok.tokenize(string, return_str=True)
    return None
    
def stem(string, string_or_list='string'):
    """Porter-stem every whitespace-separated word of the text.

    Returns the stems joined by single spaces when string_or_list is
    'string', the list of stems when it is 'list', and None otherwise."""
    # Reach the stemmer through the documented nltk.stem package (the same
    # package lemmatize uses) rather than the incidental nltk.porter alias.
    stemmer = nltk.stem.PorterStemmer()
    stems = [stemmer.stem(word) for word in string.split()]
    if string_or_list == 'list':
        return stems
    if string_or_list == 'string':
        return ' '.join(stems)
    return None
    
def lemmatize(string, string_or_list='string'):
    """Lemmatize every whitespace-separated word with nltk's WordNetLemmatizer.

    Returns the lemmas joined by single spaces when string_or_list is
    'string', the list of lemmas when it is 'list', and None otherwise."""
    lemmatizer = nltk.stem.WordNetLemmatizer()
    words = string.split()
    lemmas = list(map(lemmatizer.lemmatize, words))
    if string_or_list == 'list':
        return lemmas
    if string_or_list == 'string':
        return ' '.join(lemmas)
    return None
    
def remove_stopwords(string, string_or_list='string', extra_words=None, exclude_words=None):
    """Drop English stopwords from the whitespace-separated words of the text.

    extra_words are added to nltk's English stopword list and exclude_words
    are taken out of it before filtering. Excluding a word that is not a
    stopword is a no-op.

    Returns the remaining words joined by single spaces when string_or_list
    is 'string', the list of remaining words when it is 'list', and None
    otherwise."""
    # A set gives constant-time membership tests and removes a word entirely
    # even if it was present more than once.
    stopword_set = set(stopwords.words('english'))
    if extra_words is not None:
        stopword_set.update(extra_words)
    if exclude_words is not None:
        stopword_set.difference_update(exclude_words)
    filtered_words = [word for word in string.split() if word not in stopword_set]
    if string_or_list == 'string':
        return ' '.join(filtered_words)
    if string_or_list == 'list':
        return filtered_words
    return None
    
def pipe(v, *fns):
    """Feed v through each function in fns, left to right, and return the result."""
    result = v
    for fn in fns:
        result = fn(result)
    return result

# Master function for cleaning

def readme_lem(text):
    """Clean, tokenize, strip stopwords from and lemmatize a README text."""
    cleaned = basic_clean(text)
    tokenized = tokenize(cleaned)
    without_stopwords = remove_stopwords(tokenized)
    return lemmatize(without_stopwords)

def readme_stem(text):
    """Clean, tokenize, strip stopwords from and stem a README text."""
    cleaned = basic_clean(text)
    tokenized = tokenize(cleaned)
    without_stopwords = remove_stopwords(tokenized)
    return stem(without_stopwords)

In [5]:
# urls = get_url_list(10)

In [6]:
# Parse the saved row with the csv module: splitting the raw line on commas
# would leave the line terminator attached to the last URL.
with open('github_urls.csv', newline='') as f:
    urls = next(csv.reader(f), [])

In [7]:
# Number of URLs loaded from github_urls.csv.
len(urls)

7

In [8]:
# Scrape README text and language for every loaded URL, then preview the
# first rows of the resulting frame.
df = grab_readmes_and_languages(urls)
df.head(10)

Unnamed: 0,readme,language
0,\n\n\n\n\n\nWelcome to freeCodeCamp.org's open...,JavaScript
1,\n996.ICU\nPlease note that there exists NO ot...,Rust
2,\n\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue...,JavaScript
3,"\n\n\n\n\n\nBootstrap\n\n Sleek, intuitive, a...",JavaScript
4,\nReact · \nReact is a JavaScript library f...,JavaScript
5,\n\n\n\n\n\n\n\nDocumentation\n\n\n\n\n\n\n\n\...,C++


# PREPARE

In [9]:
# Cleaned, stopword-free, lemmatized version of each README.
df['lemmatized'] = df.readme.apply(readme_lem)

In [10]:
# Cleaned, stopword-free, stemmed version of each README.
df['stemmed'] = df.readme.apply(readme_stem)

In [11]:
# Preview the frame with the two prepared text columns.
df.head()

Unnamed: 0,readme,language,lemmatized,stemmed
0,\n\n\n\n\n\nWelcome to freeCodeCamp.org's open...,JavaScript,welcome freecodecamporgs open source codebase ...,welcom freecodecamporg open sourc codebas curr...
1,\n996.ICU\nPlease note that there exists NO ot...,Rust,996icu please note exists official account app...,996icu pleas note exist offici account app mer...
2,\n\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue...,JavaScript,supporting vuejs vuejs mitlicensed open source...,support vuej vuej mitlicens open sourc project...
3,"\n\n\n\n\n\nBootstrap\n\n Sleek, intuitive, a...",JavaScript,bootstrap sleek intuitive powerful frontend fr...,bootstrap sleek intuit power frontend framewor...
4,\nReact · \nReact is a JavaScript library f...,JavaScript,react react javascript library building user i...,react react javascript librari build user inte...


In [12]:
# Word frequencies over the lemmatized text of all READMEs labelled Rust.
pd.Series(
    ' '.join(df.loc[df.language == 'Rust', 'lemmatized']).split()
).value_counts()

996             13
license         11
list             8
work             7
company          7
996icu           7
please           5
working          5
open             5
worker           5
source           5
employee         4
schedule         4
hour             4
add              4
github           4
go               4
right            4
project          3
founder          3
labor            3
icu              3
chinese          3
never            3
see              3
refers           2
another          2
overtime         2
mean             2
progress         2
                ..
least            1
feeling          1
back             1
version          1
control          1
petition         1
channel          1
60               1
voice            1
youzan           1
am9              1
hear             1
conduct          1
unit             1
alibaba          1
anti             1
financial        1
uphold           1
vanished         1
give             1
corp             1
without     

# EXPLORE

# MODEL