In [1]:
#import tools for web scraping:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd

#group imports
import env
import acquire
import constants_prepare

import json
from typing import Dict, List, Optional, Union, cast
import requests

#visualizations:
import matplotlib.pyplot as plt
import seaborn as sns

_____________________________________________________________________________

## Exploring Ubuntu:

In [None]:
ubuntu = pd.read_json('ubuntu_data.json')

In [None]:
ubuntu.head()

#### Normalizing the data:

In [None]:
import unicodedata
import re

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

In [None]:
#to view just one of the readme docs and see what kind of normalizing is needed:
soup = BeautifulSoup(ubuntu.readme_contents.iloc[10], 'html.parser')
soup

In [None]:
#getting an overall view of the data:
ubuntu.info()

In [None]:
#looking at any nulls:
ubuntu.isnull().sum()

In [None]:
#dropping nulls (we found that these repos have no languages attached)
ubuntu = ubuntu.dropna()

In [None]:
#df w/o nulls:
ubuntu.isnull().sum()

In [None]:
#seeing the unique languages:
ubuntu.language.value_counts()

______________________________________________

## Ubuntu Languages:

In [None]:
#looking at the overall percentages of the top 10 languages used in Ubuntu repos:
language_counts=ubuntu.language.value_counts(normalize=True).head(10)
language_counts

In [None]:
#creating a df of the percentages to prep for charts:
temp = pd.DataFrame({'language' : language_counts.index, 'percentage': language_counts.values})
temp

In [None]:
#plotting out the percentages of the Top 10 languages used in Ubuntu repos:
plt.figure(figsize=(10,8))
sns.barplot(data=temp, x = 'language', y = 'percentage')

## Cleaning the Data:

Steps to parsing data:
- 1) Convert text to all lower case for normalcy.
- 2) Remove any accented characters, non-ASCII characters.
- 3) Remove special characters and html and linux related words.
- 4) Stem or lemmatize the words.(stem = "if b, then c")
- 5) Remove stopwords.(if, and, the, etc)
- 6) Store the clean text and the original text for use in future notebooks.

In [None]:
# Extra stopwords applied on top of NLTK's English list in clean_data.
# Every entry is a string and appears once: tokens are strings, so the bare
# integer 1 that used to sit in this list could never match anything, and the
# repeated 'server' / 'update' entries added nothing.
ADDITIONAL_STOPWORDS = [
    'sudo', 'distro', 'linux', 'aptget',
    'ubuntu', 'debian', 'arch', 'archlinux',
    'git', 'root', 'image',
    'install', 'installed', 'installing',
    'use', 'user', 'used', 'using',
    'server', 'kernel', 'update', 'package',
    'file', 'run', 'system',
    'configure', 'configured',
    'command', 'script', 'set', 'build',
    'need', 'make', 'option',
    'contain', 'contained',
    'updated', 'kerneled', 'version',
    '1', 'download', 'create', 'default',
]

def clean_data(text):
    """Normalize a readme and return its stemmed words as one space-joined string.

    Steps: strip accents / non-ASCII characters, lowercase, drop URL tokens,
    strip punctuation from each token, remove stopwords (NLTK English list plus
    ADDITIONAL_STOPWORDS), then Porter-stem whatever is left.

    Parameters
    ----------
    text : str
        Raw readme text.

    Returns
    -------
    str
        The surviving stems separated by single spaces ('' if nothing survives).
    """
    stemmer = nltk.porter.PorterStemmer()
    # A set gives O(1) membership tests; a distinct local name also avoids
    # shadowing the `stopwords` module imported from nltk.corpus.
    stopword_set = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Remove URLs one token at a time. The previous pattern used a greedy `.*`,
    # which swallowed everything from the first "http" up to the last
    # delimiter on the same line -- ordinary words included.
    text = re.sub(r'[\(<\"]?http\S*', ' ', text)
    # Strip punctuation inside each token; tokens that end up empty are skipped.
    tokens = (re.sub(r'[^\w\s]', '', token) for token in text.split())
    return ' '.join(stemmer.stem(token) for token in tokens
                    if token and token not in stopword_set)

In [None]:
#grabbing text from df:
ubuntu_corpus = ' '.join(ubuntu['readme_contents'])
ubuntu_corpus[:100]

In [None]:
#creating a list 
ubuntu_corpus_list = clean_data(ubuntu_corpus).split()
ubuntu_corpus_list

In [None]:
u_count = pd.Series(ubuntu_corpus_list)

In [None]:
u_count.value_counts().nlargest(20)

In [None]:
#adding the clean function through the readme content and adding 
#onto a new column in the df:
ubuntu['clean_readme'] = ubuntu.readme_contents.apply(clean_data)
ubuntu

______________________________

## Creating a df for the top 10 languages:

In [None]:
#finding all shell readmes:
shell_readme =' '.join(ubuntu[ubuntu['language'] == 'Shell'].clean_readme)

In [None]:
ubuntu[ubuntu['language'] == 'Shell'].clean_readme

In [None]:
#shell words: clean_readme already went through clean_data (cleaned + stemmed),
#so the joined text is reused as-is; a second pass would re-stem the stems:
shell_words = shell_readme

In [None]:
#finding all python readmes:
python_readme =' '.join(ubuntu[ubuntu['language'] == 'Python'].clean_readme)

In [None]:
#python words: clean_readme already went through clean_data (cleaned + stemmed),
#so the joined text is reused as-is; a second pass would re-stem the stems:
python_words = python_readme

In [None]:
#finding all dockerfile readmes:
dockerfile_readme =' '.join(ubuntu[ubuntu['language'] == 'Dockerfile'].clean_readme)

In [None]:
#dockerfile words: clean_readme already went through clean_data (cleaned + stemmed),
#so the joined text is reused as-is; a second pass would re-stem the stems:
dockerfile_words = dockerfile_readme

In [None]:
#finding all C readmes:
C_readme =' '.join(ubuntu[ubuntu['language'] == 'C'].clean_readme)

In [None]:
#C words: clean_readme already went through clean_data (cleaned + stemmed),
#so the joined text is reused as-is; a second pass would re-stem the stems:
C_words = C_readme

In [None]:
#finding all C ++ readmes:
Cplus_readme =' '.join(ubuntu[ubuntu['language'] == 'C++'].clean_readme)

In [None]:
#C++ words: clean_readme already went through clean_data (cleaned + stemmed),
#so the joined text is reused as-is; a second pass would re-stem the stems:
Cplus_words = Cplus_readme

### Finding word frequencies per language:

In [None]:
#creating one frequency series per language: split each cleaned string
#into words, then count how often every word occurs

shell_freq = pd.Series(shell_words.split()).value_counts()
python_freq = pd.Series(python_words.split()).value_counts()  # was misspelled `value_counshets`
dockerfile_freq = pd.Series(dockerfile_words.split()).value_counts()
C_freq = pd.Series(C_words.split()).value_counts()
Cplus_freq = pd.Series(Cplus_words.split()).value_counts()

In [None]:
#looking at how we need to split:
pd.Series(shell_words.split()).value_counts()

In [None]:
#combining frequencies into one dataframe (one column per language).
#keys= names the columns while concatenating; the old set_axis(..., inplace=False)
#call fails on pandas 2.x, where the inplace argument no longer exists.
word_counts = (pd.concat([shell_freq, python_freq, dockerfile_freq, C_freq, Cplus_freq],
                         axis=1, sort=True,
                         keys=['shell','python','dockerfile','C','Cplus'])
              .fillna(0)       # a word missing from a language counts as 0
              .astype(int))

In [None]:
word_counts.head()

### What are the most frequently occurring words across these languages?

In [None]:
word_counts.sort_values(by='shell', ascending=False).head(10)

___________________________________

## How long is a typical Ubuntu Readme (after clean)?

In [None]:
#creating a column with the length (in characters) of each cleaned readme.
#clean_readme already holds the clean_data output, so its string length is
#read directly instead of cleaning every readme a second time:
ubuntu['length_of_readme'] = ubuntu['clean_readme'].str.len()

In [None]:
#plotting this to show any anomalies:
plt.figure(figsize = (10,10))
sns.boxplot(data = ubuntu.length_of_readme)

In [None]:
#finding the mean, min and max of Ubuntu readmes:
ubuntu.length_of_readme.describe()

In [None]:
#longest readme info (looked up via max() so it stays right if the data changes):
ubuntu[ubuntu['length_of_readme'] == ubuntu['length_of_readme'].max()]

In [None]:
#minimum readmes:
ubuntu[ubuntu['length_of_readme'] == 0]

In [None]:
ubuntu['cleaned_length'] = 0

In [None]:
plt.figure(figsize=(18, 8))
#average only the length column: calling mean() on the whole grouped frame
#also hits the text columns, which pandas 2.x rejects with a TypeError
avg_length_by_language = (ubuntu.groupby('language')['length_of_readme']
                          .mean()
                          .reset_index()
                          .sort_values('length_of_readme', ascending=False))
sns.barplot(data=avg_length_by_language, x='length_of_readme', y='language')
plt.title('Average Readme length by Language')
plt.show()

_____________________________________________

## Number of unique words in Ubuntu Repos:

In [None]:
#the corpus as a Series (reused for the n-grams below), then its distinct-word count:
ubuntu_corpus_series = pd.Series(ubuntu_corpus_list)
ubuntu_corpus_series.nunique()

___________________________________________

## Most common bigrams of Ubuntu:

In [None]:
ubuntu_bigrams = (pd.Series(nltk.ngrams(ubuntu_corpus_series, 2)).value_counts().head(20))

In [None]:
ubuntu_bigrams

## Most common trigrams of Ubuntu:

In [None]:
ubuntu_trigrams = (pd.Series(nltk.ngrams(ubuntu_corpus_series, 3)).value_counts().head(20))
ubuntu_trigrams

In [None]:
from wordcloud import WordCloud

# WordCloud.generate() expects a single text string, but ubuntu_bigrams is a
# Series of counts indexed by (word, word) tuples. Turn it into a
# {"word word": count} mapping and build the cloud from those frequencies.
bigram_frequencies = {' '.join(pair): count for pair, count in ubuntu_bigrams.items()}
img = WordCloud(background_color='white').generate_from_frequencies(bigram_frequencies)
# WordCloud produces an image object, which can be displayed with plt.imshow
plt.imshow(img)
# axis aren't very useful for a word cloud
plt.axis('off')


________________________________________________

## Reading in the master list csv:

In [None]:
master_df = pd.read_csv('master_list.csv')

In [None]:
master_df

### Exploring the data:

In [None]:
master_df.info()

In [None]:
master_df.info

In [None]:
master_df.describe()

In [None]:
#any nulls?
master_df.isnull().sum()

In [None]:
#dropping nulls:
master_df = master_df.dropna()

In [None]:
#just checking if worked:
master_df.isnull().sum()

## Looking at top languages across master_list:

In [None]:
#looking at the overall percentages of the top 10 languages used in all repos:
master_language_counts=master_df.language.value_counts(normalize=True).head(10)
master_language_counts

In [None]:
#creating a df of the percentages to prep for charts:
temp = pd.DataFrame({'language' : master_language_counts.index, 'percentage': master_language_counts.values})
temp

In [None]:
#plotting out the percentages of the Top 10 languages across all repos in master_df
#(temp was rebuilt from master_language_counts in the cell above):
plt.figure(figsize=(10,8))
sns.barplot(data=temp, x = 'language', y = 'percentage')

## Finding common words overall for stop_words:

In [None]:
#grabbing text from df:
master_corpus = ' '.join(master_df['readme_contents'])
master_corpus[:100]

In [None]:
#first, lower all letters:
master_corpus = master_corpus.lower()

In [None]:
#next, normalize by removing special characters:
import unicodedata

master_corpus = unicodedata.normalize('NFKD', master_corpus)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')

In [None]:
#taking out special characters:
# remove anything that is not a through z, a number, a single quote, or whitespace
master_corpus = re.sub(r"[^a-z0-9'\s]", '', master_corpus)

In [None]:
#tokenizing:
import nltk
tokenizer = nltk.tokenize.ToktokTokenizer()

print(tokenizer.tokenize(master_corpus, return_str=True)[0:500])


In [None]:
#removing main stopwords:
from nltk.corpus import stopwords
stopword_list = stopwords.words('english')

stopword_list.remove('no')
stopword_list.remove('not')

stopword_list[:10]

In [None]:
#filtering the stopwords out of the corpus:
words = master_corpus.split()
#set membership is O(1); testing against the list rescanned it for every word
stopword_set = set(stopword_list)
filtered_words = [w for w in words if w not in stopword_set]

print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
print('---')

corpus_without_stopwords = ' '.join(filtered_words)

#preview only -- printing the entire corpus floods the notebook output:
print(corpus_without_stopwords[:500])


In [None]:
#creating a list 
master_corpus_list = corpus_without_stopwords.split()
master_corpus_list

### These should be the top 40 words we include in stopwords for our clean/prep:

In [None]:
main_count = pd.Series(master_corpus_list)
main_count.value_counts().nlargest(60)

In [None]:

# Extra stopwords for the combined (all distros) corpus, applied on top of
# NLTK's English list in clean_data. Every entry is a string and appears once:
# tokens are strings, so the bare integers 3, 1, 6 and 2 previously listed here
# could never match (6 is now spelled '6' so it actually filters), and the
# repeated 'ubuntu' / 'debian' / 'arch' entries added nothing.
ADDITIONAL_STOPWORDS=[
    'sudo', 'use', 'not', 'run',
    'file', 'linux', '3', '1', 'script', 'ubuntu',
    'default', 'build', 'system', 'using',
    'docker', 'make', 'kernel',
    'version','packages', 'package','image',
    'debian', 'server', 'set', 'files', '6',
    'arch', 'configuration', 'installation',
    'user', 'also', 'update', 'see', 'used',
    'need', '2', '4', '0', 'git', 'command', 'add',
    'following', 'directory', 'no', 'new',
    'want', 'create', 'installed', 'e', 'name',
    'support', 'root', 'running', 'one', 'p',
    'like', 'environment', 'example', 'repository',
    'source', 'archlinux'
]

In [None]:
#cleaning without stemming: this version returns the surviving words as a list
def clean_data(text):
    """Normalize text and return its non-stopword tokens as a list.

    Steps: strip accents / non-ASCII characters, lowercase, drop URL tokens,
    strip punctuation from each token, and remove stopwords (NLTK English list
    plus ADDITIONAL_STOPWORDS). No stemming is applied.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    # A set gives O(1) membership tests; a distinct local name also avoids
    # shadowing the `stopwords` module imported from nltk.corpus.
    stopword_set = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Remove URLs one token at a time. The previous pattern used a greedy `.*`,
    # which swallowed everything from the first "http" up to the last
    # delimiter on the same line -- ordinary words included.
    text = re.sub(r'[\(<\"]?http\S*', ' ', text)
    # Strip punctuation inside each token; tokens that end up empty are skipped.
    tokens = (re.sub(r'[^\w\s]', '', token) for token in text.split())
    return [token for token in tokens if token and token not in stopword_set]

In [None]:
#creating a new list of top mentioned words:
main_list = clean_data(master_corpus)
main_list

In [None]:
#creating new series
main_list_count = pd.Series(main_list)

### Top words of Full corpus, after clean:

In [None]:
#creating new top words list:
main_list_count.value_counts().nlargest(50)

In [None]:
#finding all shell readmes (master_df has no clean_readme column at this point,
#so the raw text is joined and clean_data does the cleaning in the next cell):
shell_readme = ' '.join(master_df[master_df['language'] == 'Shell'].readme_contents)

In [None]:
#renaming to shell words and cleaning data:
shell_words=clean_data(shell_readme)

In [None]:
#finding all python readmes (master_df has no clean_readme column at this point,
#so the raw text is joined and clean_data does the cleaning in the next cell):
python_readme = ' '.join(master_df[master_df['language'] == 'Python'].readme_contents)

In [None]:
#renaming to python words and cleaning data:
python_words=clean_data(python_readme)

In [None]:
#finding all C readmes (master_df has no clean_readme column at this point,
#so the raw text is joined and clean_data does the cleaning in the next cell):
C_readme = ' '.join(master_df[master_df['language'] == 'C'].readme_contents)

In [None]:
#renaming to C words and cleaning data:
C_words=clean_data(C_readme)

In [None]:
#finding all dockerfile readmes (master_df has no clean_readme column at this point,
#so the raw text is joined and clean_data does the cleaning in the next cell):
dockerfile_readme = ' '.join(master_df[master_df['language'] == 'Dockerfile'].readme_contents)


In [None]:
#renaming to dockerfile words and cleaning data:
dockerfile_words=clean_data(dockerfile_readme)

In [None]:
#creating one frequency series per language. In this section clean_data
#returns a list of words, so each list goes straight into a Series
#(calling .split() on a list raises AttributeError):

shell_freq = pd.Series(shell_words).value_counts()
python_freq = pd.Series(python_words).value_counts()
dockerfile_freq = pd.Series(dockerfile_words).value_counts()
C_freq = pd.Series(C_words).value_counts()

In [None]:
#combining frequencies into one dataframe (one column per language).
#keys= names the columns while concatenating; the old set_axis(..., inplace=False)
#call fails on pandas 2.x, where the inplace argument no longer exists.
word_counts = (pd.concat([shell_freq, python_freq, C_freq, dockerfile_freq],
                         axis=1, sort=True,
                         keys=['shell','python','C','dockerfile'])
              .fillna(0)       # a word missing from a language counts as 0
              .astype(int))

In [None]:
word_counts

### What are the most frequently occurring words across these languages?

In [None]:
word_counts.sort_values(by='shell', ascending=False).head(10)

## How long is an average ReadMe across languages?

In [None]:
#creating a column that gives the length of each readme after the clean_data function.
#The clean_data defined earlier in this section returns a list of words, so len()
#here is a word count (assuming that definition is the one in scope when this runs).
master_df['length_of_readme'] = master_df['readme_contents'].apply(lambda r : len(clean_data(r)))

In [None]:
master_df.head()

In [None]:
plt.figure(figsize=(18, 12))
#average only the length column: calling mean() on the whole grouped frame
#also hits the text columns, which pandas 2.x rejects with a TypeError
avg_length_by_language = (master_df.groupby('language')['length_of_readme']
                          .mean()
                          .reset_index()
                          .sort_values('length_of_readme', ascending=False))
sns.barplot(data=avg_length_by_language, x='length_of_readme', y='language')
plt.title('Average Readme length by Language')
plt.show()

In [None]:
## Number of Unique words:
#the corpus as a Series (reused for the bigrams below), then its distinct-word count
main_corpus_series = pd.Series(master_corpus_list)
main_corpus_series.nunique()

### Most common bigrams:

In [None]:
master_bigrams = (pd.Series(nltk.ngrams(main_corpus_series, 2)).value_counts().head(20))
master_bigrams

In [None]:
##realized I forgot to rename my cleaned df...will need to go back up and find that...

__________________________________

## Trying out the new stopwords:

In [2]:
#calling in master_df:
master_df = pd.read_csv('master_list.csv')

In [3]:
master_df

Unnamed: 0,repo,language,readme_contents,distro
0,dockerfile/ubuntu,Shell,## Ubuntu Dockerfile\n\n\nThis repository cont...,ubuntu
1,boxcutter/ubuntu,Shell,# Packer templates for Ubuntu written in legac...,ubuntu
2,wszqkzqk/deepin-wine-ubuntu,C,# Deepin wine for Ubuntu and Debian\n\n## 一、项目...,ubuntu
3,fcwu/docker-ubuntu-vnc-desktop,HTML,# docker-ubuntu-vnc-desktop\n\n[![Docker Pulls...,ubuntu
4,docker-32bit/ubuntu,Shell,ubuntu\n======\n\nBuild a docker image for ubu...,ubuntu
...,...,...,...,...
3295,gmas/home-router-ansible,Ruby,# home-router-ansible\nAnsible scripts for set...,arch
3296,Voltasalt/tial,Shell,# tial\nTwitch Installs Arch Linux: Scripts\n,arch
3297,Caesim404/sikulix-git,Shell,,arch
3298,danboid/ZALARM-install,,# Installing Arch Linux ARM (ALARM) on the SHA...,arch


In [4]:
#using drop null function:
master_df = constants_prepare.drop_nulls(master_df)

In [6]:
#pulling in new columns:
master_df = constants_prepare.adding_columns(master_df)
master_df

Unnamed: 0,repo,language,readme_contents,distro,clean_readme,length_of_readme
0,dockerfile/ubuntu,Shell,## Ubuntu Dockerfile\n\n\nThis repository cont...,ubuntu,"[ubuntu, dockerfil, repositori, contain, docke...",31
1,boxcutter/ubuntu,Shell,# Packer templates for Ubuntu written in legac...,ubuntu,"[packer, templat, ubuntu, written, legaci, jso...",501
2,wszqkzqk/deepin-wine-ubuntu,C,# Deepin wine for Ubuntu and Debian\n\n## 一、项目...,ubuntu,"[deepin, wine, ubuntu, debian, deepinwin, ubun...",164
3,fcwu/docker-ubuntu-vnc-desktop,HTML,# docker-ubuntu-vnc-desktop\n\n[![Docker Pulls...,ubuntu,"[dockerubuntuvncdesktop, docker, pull, docker,...",386
4,docker-32bit/ubuntu,Shell,ubuntu\n======\n\nBuild a docker image for ubu...,ubuntu,"[ubuntu, build, docker, imag, ubuntu, i386, ru...",14
...,...,...,...,...,...,...
3293,thatch45/varch,Python,==Why Do We Need Varch?==\nThe use of virtuali...,arch,"[need, use, rapidli, expand, linux, world, man...",365
3294,archclassroom/archclassroom.github.io,HTML,# archclassroom.github.io\nArch Linux Classroo...,arch,"[archclassroomgithubio, arch, linux, classroom...",5
3295,gmas/home-router-ansible,Ruby,# home-router-ansible\nAnsible scripts for set...,arch,"[homerouteran, ansibl, script, set, linux, rou...",18
3296,Voltasalt/tial,Shell,# tial\nTwitch Installs Arch Linux: Scripts\n,arch,"[tial, twitch, instal, arch, linux, script]",6
