In [28]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata

import constants_prepare as prepare

In [29]:
df = pd.read_csv('master_list.csv')

In [30]:
df

Unnamed: 0,repo,language,readme_contents
0,dockerfile/ubuntu,Shell,## Ubuntu Dockerfile\n\n\nThis repository cont...
1,boxcutter/ubuntu,Shell,# Packer templates for Ubuntu written in legac...
2,wszqkzqk/deepin-wine-ubuntu,C,# Deepin wine for Ubuntu and Debian\n\n## 一、项目...
3,fcwu/docker-ubuntu-vnc-desktop,HTML,# docker-ubuntu-vnc-desktop\n\n[![Docker Pulls...
4,docker-32bit/ubuntu,Shell,ubuntu\n======\n\nBuild a docker image for ubu...
...,...,...,...
3295,gmas/home-router-ansible,Ruby,# home-router-ansible\nAnsible scripts for set...
3296,Voltasalt/tial,Shell,# tial\nTwitch Installs Arch Linux: Scripts\n
3297,Caesim404/sikulix-git,Shell,
3298,danboid/ZALARM-install,,# Installing Arch Linux ARM (ALARM) on the SHA...


In [31]:
def no_stem_clean_data(text):
    ps = nltk.porter.PorterStemmer()
    stopwords = nltk.corpus.stopwords.words('english') + prepare.ADDITIONAL_STOPWORDS
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[\(<\"]?http.*[\)>\"\s]', ' ', text).split()
    words = [re.sub(r'[^\w\s]', '', text) for text in words]
    try:
        while True:
            words.remove('')
    except ValueError:
        pass
    
    return [word for word in words if word not in stopwords]

In [32]:
df = df.dropna().reset_index().drop(columns= 'index')

In [33]:
df['cleaned_readme'] = df.readme_contents.apply(no_stem_clean_data)

In [34]:
df['cleaned_length'] = 0
for i in range(len(df.cleaned_readme)):
    df['cleaned_length'][i] = len(df.cleaned_readme[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_length'][i] = len(df.cleaned_readme[i])


In [35]:
df

Unnamed: 0,repo,language,readme_contents,cleaned_readme,cleaned_length
0,dockerfile/ubuntu,Shell,## Ubuntu Dockerfile\n\n\nThis repository cont...,"[dockerfile, repository, contains, dockerfile,...",29
1,boxcutter/ubuntu,Shell,# Packer templates for Ubuntu written in legac...,"[packer, templates, written, legacy, json, ove...",515
2,wszqkzqk/deepin-wine-ubuntu,C,# Deepin wine for Ubuntu and Debian\n\n## 一、项目...,"[deepin, wine, deepinwine, ubuntudebian, deepi...",155
3,fcwu/docker-ubuntu-vnc-desktop,HTML,# docker-ubuntu-vnc-desktop\n\n[![Docker Pulls...,"[dockerubuntuvncdesktop, docker, pulls, docker...",416
4,docker-32bit/ubuntu,Shell,ubuntu\n======\n\nBuild a docker image for ubu...,"[build, docker, image, i386, run, buildimagesh...",12
...,...,...,...,...,...
2800,thatch45/varch,Python,==Why Do We Need Varch?==\nThe use of virtuali...,"[need, varch, use, virtualization, rapidly, ex...",393
2801,archclassroom/archclassroom.github.io,HTML,# archclassroom.github.io\nArch Linux Classroo...,"[archclassroomgithubio, classroom, website]",3
2802,gmas/home-router-ansible,Ruby,# home-router-ansible\nAnsible scripts for set...,"[homerouteransible, ansible, scripts, setting,...",18
2803,Voltasalt/tial,Shell,# tial\nTwitch Installs Arch Linux: Scripts\n,"[tial, twitch, installs, scripts]",4


# Looking at value counts of all words prior to stemming

In [44]:
corpus_list = []
for entry in range(len(df.readme_contents)):
    corpus_list.extend(df.cleaned_readme[entry])
corpus = pd.Series(corpus_list)
corpus.describe()

count      971452
unique      94937
top       install
freq         8651
dtype: object

In [45]:
corpus.value_counts().sort_values(ascending=False).head(20)

install     8651
use         6226
run         5762
file        5267
script      4381
default     4105
build       4052
system      3848
using       3686
docker      3646
make        3617
kernel      3517
bash        3506
packages    3474
1           3455
version     3435
package     3384
image       3372
set         3183
server      3138
dtype: int64

In [43]:
corpus

0         dockerfile
1         repository
2           contains
3         dockerfile
4               base
             ...    
971447           url
971448          isok
971449           aur
971450         clone
971451          link
Length: 971452, dtype: object

Look at frequency of words in languages next

# Exploring Overall Data

In [36]:
df.cleaned_length.describe()

count     2805.000000
mean       346.328699
std        814.688050
min          0.000000
25%         57.000000
50%        150.000000
75%        348.000000
max      22148.000000
Name: cleaned_length, dtype: float64