# PLAN

- [ ] Acquisition
    - [ ] Select what list of repos to scrape.
    - [ ] Get requests from the site.
    - [ ] Save responses to csv.
- [ ] Preparation
    - [ ] Prepare the data for analysis.
- [ ] Exploration
    - [ ] Answer the following prompts:
        - [ ] What are the most common words in READMEs?
        - [ ] What does the distribution of IDFs look like for the most common words?
        - [ ] Does the length of the README vary by language?
        - [ ] Do different languages use a different number of unique words?
- [ ] Modeling
    - [ ] Transform the data for machine learning; use language to predict.
    - [ ] Fit several models using different text representations.
    - [ ] Build a function that takes in the text of a README file and makes a prediction of its language.
- [ ] Delivery
    - [ ] Github repo
        - [x] This notebook.
        - [ ] Documentation within the notebook.
        - [ ] README file in the repo.
        - [ ] Python scripts if applicable.
    - [ ] Google Slides
        - [ ] 1-2 slides only summarizing analysis.
        - [ ] Visualizations are labeled.
        - [ ] Geared for the general audience.
        - [ ] Share the link in the README file and/or classroom.

# ENVIRONMENT

In [46]:
# disable warnings
import warnings
warnings.filterwarnings("ignore")

import unicodedata
import re
import os
from requests import get
import json
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# First page (p=1) of GitHub's repository search for repos with more than
# zero stars (q=stars:>0), using the "stars" sort key (s=stars).
BASEURL = 'https://github.com/search?p=1&q=stars%3A%3E0&s=stars&type=Repositories'
# Request headers passed to every get() call in this notebook; only a custom
# User-Agent is set.
HEADERS = {'User-Agent': 'Sentient Attack Helicoptor'}

# ACQUIRE

The first step is to get the links to the most-starred GitHub repositories.

In [101]:
def get_url_list(page):
    """Collect repository URLs from GitHub's most-starred search results.

    Walks search result pages 1 through ``page`` (inclusive) and keeps the
    link of every repository entry that lists a programming language.
    Progress and skipped entries are reported with ``print``.

    Args:
        page: Number of result pages to traverse. Values below 1 traverse
            nothing and yield an empty list.

    Returns:
        A list of absolute ``https://github.com/...`` repository URLs, in
        the order they were encountered.
    """
    urls = []
    for i in range(1, page + 1):
        url = 'https://github.com/search?p=' + str(i) + '&q=stars%3A%3E0&s=stars&type=Repositories'
        print(f'traversing url: {url}')
        response = get(url, headers=HEADERS)
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        list_of_repos = soup.find('ul', class_='repo-list')
        if list_of_repos is None:
            # The page did not contain a result list (possibly an error or
            # rate-limit page); skip it instead of failing on None.
            print(f'skipping page {i} because no repo list was found '
                  f'(HTTP {response.status_code})')
            continue
        for item in list_of_repos.find_all('li', class_='repo-list-item'):
            if item.find(attrs={'itemprop': 'programmingLanguage'}) is None:
                print('skipping because no programming language')
                continue
            link = item.find('a')
            if link is None or 'href' not in link.attrs:
                print('skipping because no repository link')
                continue
            # Search results carry site-relative hrefs such as "/owner/repo".
            urls.append('https://github.com' + link.attrs['href'])
    print(f'Scraped a total of {len(urls)} github urls.')
    return urls


In [100]:
get_url_list(1)

traversing url: https://github.com/search?p=1&q=stars%3A%3E0&s=stars&type=Repositories
skipping because no programming language
skipping because no programming language
skipping because no programming language
7


['https://github.com/freeCodeCamp/freeCodeCamp',
 'https://github.com/996icu/996.ICU',
 'https://github.com/vuejs/vue',
 'https://github.com/twbs/bootstrap',
 'https://github.com/facebook/react',
 'https://github.com/tensorflow/tensorflow',
 'https://github.com/robbyrussell/oh-my-zsh']

In [61]:
#construct a url list

def get_url_list2(page):
    """Collect URL and language details for GitHub's most-starred repos.

    Walks search result pages 1 through ``page`` (inclusive) and builds one
    record per repository entry found in the result list.

    Args:
        page: Number of result pages to traverse. Values below 1 traverse
            nothing and yield an empty list.

    Returns:
        A list of dictionaries, one per repository, with the keys:
            ``'url'``: absolute ``https://github.com/...`` repository URL.
            ``'language'``: text of the entry's ``programmingLanguage``
                element, or ``None`` when the entry has none.
    """
    # Accumulate across all pages; resetting per page would drop earlier ones.
    results = []
    for i in range(1, page + 1):
        url = 'https://github.com/search?p=' + str(i) + '&q=stars%3A%3E0&s=stars&type=Repositories'
        print(f'traversing url: {url}')
        response = get(url, headers=HEADERS)
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the container that holds the repository entries.
        main_content = soup.find(class_='repo-list')
        if main_content is None:
            # The page did not contain a result list (possibly an error or
            # rate-limit page); skip it instead of failing on None.
            print(f'skipping page {i} because no repo list was found '
                  f'(HTTP {response.status_code})')
            continue
        # find_all yields every entry; find would return only the first one.
        for repo in main_content.find_all('li', class_='repo-list-item'):
            # href is an attribute of the anchor, not a tag of its own.
            link = repo.find('a', href=True)
            if link is None:
                print('skipping because no repository link')
                continue
            # Look the language up inside this entry, not the whole page.
            language_tag = repo.find(attrs={'itemprop': 'programmingLanguage'})
            language = None
            if language_tag is not None:
                language = language_tag.get_text(strip=True)
            results.append({
                'url': 'https://github.com' + link['href'],
                'language': language,
            })
    return results

In [60]:
get_url_list(1)

traversing url: https://github.com/search?p=1&q=stars%3A%3E0&s=stars&type=Repositories
<li class="repo-list-item d-flex flex-column flex-md-row flex-justify-start py-4 public source">
<div class="col-12 col-md-8 pr-md-3">
<h3>
<a class="v-align-middle" data-hydro-click='{"event_type":"search_result.click","payload":{"page_number":1,"per_page":10,"query":"stars:&gt;0","result_position":1,"click_id":177736533,"result":{"id":177736533,"global_relay_id":"MDEwOlJlcG9zaXRvcnkxNzc3MzY1MzM=","model_name":"Repository","url":"https://github.com/996icu/996.ICU"},"client_id":null,"originating_request_id":"E6C9:50D9:10B6A4:1D437B:5CD4416D","originating_url":"https://github.com/search?p=1&amp;q=stars%3A%3E0&amp;s=stars&amp;type=Repositories","referrer":null,"user_id":null}}' data-hydro-click-hmac="dd37b7f474ec18267c1a1737ded8c8aa8424486d3f01c9a63ca68651c8171139" href="/996icu/996.ICU">996icu/996.ICU</a>
</h3>
<p class="col-12 col-md-9 d-inline-block text-gray mb-2 pr-4">
        Repo for counting st

{}

# PREPARE

# EXPLORE

# MODEL