In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata

import constants_prepare as prepare

In [2]:
df = pd.read_csv('master_list.csv')

In [3]:
df

Unnamed: 0,repo,language,readme_contents,distro
0,dockerfile/ubuntu,Shell,## Ubuntu Dockerfile\n\n\nThis repository cont...,ubuntu
1,boxcutter/ubuntu,Shell,# Packer templates for Ubuntu written in legac...,ubuntu
2,wszqkzqk/deepin-wine-ubuntu,C,# Deepin wine for Ubuntu and Debian\n\n## 一、项目...,ubuntu
3,fcwu/docker-ubuntu-vnc-desktop,HTML,# docker-ubuntu-vnc-desktop\n\n[![Docker Pulls...,ubuntu
4,docker-32bit/ubuntu,Shell,ubuntu\n======\n\nBuild a docker image for ubu...,ubuntu
...,...,...,...,...
3295,gmas/home-router-ansible,Ruby,# home-router-ansible\nAnsible scripts for set...,arch
3296,Voltasalt/tial,Shell,# tial\nTwitch Installs Arch Linux: Scripts\n,arch
3297,Caesim404/sikulix-git,Shell,,arch
3298,danboid/ZALARM-install,,# Installing Arch Linux ARM (ALARM) on the SHA...,arch


In [4]:
def no_stem_clean_data(text):
    ps = nltk.porter.PorterStemmer()
    stopwords = nltk.corpus.stopwords.words('english') + prepare.ADDITIONAL_STOPWORDS
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[\(<\"]?http.*[\)>\"\s]', ' ', text).split()
    words = [re.sub(r'[^\w\s]', '', text) for text in words]
    try:
        while True:
            words.remove('')
    except ValueError:
        pass
    
    return [word for word in words if word not in stopwords]

In [5]:
df = df.dropna().reset_index().drop(columns= 'index')

In [6]:
df['cleaned_readme'] = df.readme_contents.apply(no_stem_clean_data)

In [7]:
df['cleaned_length'] = 0
for i in range(len(df.cleaned_readme)):
    df['cleaned_length'][i] = len(df.cleaned_readme[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_length'][i] = len(df.cleaned_readme[i])


In [8]:
df

Unnamed: 0,repo,language,readme_contents,distro,cleaned_readme,cleaned_length
0,dockerfile/ubuntu,Shell,## Ubuntu Dockerfile\n\n\nThis repository cont...,ubuntu,"[dockerfile, repository, contains, dockerfile,...",29
1,boxcutter/ubuntu,Shell,# Packer templates for Ubuntu written in legac...,ubuntu,"[packer, templates, written, legacy, json, ove...",515
2,wszqkzqk/deepin-wine-ubuntu,C,# Deepin wine for Ubuntu and Debian\n\n## 一、项目...,ubuntu,"[deepin, wine, deepinwine, ubuntudebian, deepi...",155
3,fcwu/docker-ubuntu-vnc-desktop,HTML,# docker-ubuntu-vnc-desktop\n\n[![Docker Pulls...,ubuntu,"[dockerubuntuvncdesktop, docker, pulls, docker...",416
4,docker-32bit/ubuntu,Shell,ubuntu\n======\n\nBuild a docker image for ubu...,ubuntu,"[build, docker, image, i386, run, buildimagesh...",12
...,...,...,...,...,...,...
2800,thatch45/varch,Python,==Why Do We Need Varch?==\nThe use of virtuali...,arch,"[need, varch, use, virtualization, rapidly, ex...",393
2801,archclassroom/archclassroom.github.io,HTML,# archclassroom.github.io\nArch Linux Classroo...,arch,"[archclassroomgithubio, classroom, website]",3
2802,gmas/home-router-ansible,Ruby,# home-router-ansible\nAnsible scripts for set...,arch,"[homerouteransible, ansible, scripts, setting,...",18
2803,Voltasalt/tial,Shell,# tial\nTwitch Installs Arch Linux: Scripts\n,arch,"[tial, twitch, installs, scripts]",4


# Looking at value counts of all words prior to stemming

In [9]:
corpus_list = []
for entry in range(len(df.readme_contents)):
    corpus_list.extend(df.cleaned_readme[entry])
corpus = pd.Series(corpus_list)
corpus.describe()

count      971452
unique      94937
top       install
freq         8651
dtype: object

Look at frequency of words in languages next

# Exploring Overall Data

In [10]:
df.cleaned_length.describe()

count     2805.000000
mean       346.328699
std        814.688050
min          0.000000
25%         57.000000
50%        150.000000
75%        348.000000
max      22148.000000
Name: cleaned_length, dtype: float64

# Looking at top word percentage per language
Top 3 languages are shell, python, and c

In [11]:
top_1000 = corpus.value_counts().sort_values(ascending=False).head(5000)

In [12]:
shell = df.query('language == "Shell"').reset_index()
python = df.query('language == "Python"').reset_index()
c_lang = df.query('language == "C"').reset_index()
other_lang = df.query('language != "Shell" & language != "Python" & language != "C"').reset_index()

In [13]:
len(shell) + len(python) + len(c_lang) + len(other_lang)

2805

In [14]:
shell_corpus_list = []
for entry in range(len(shell)):
    shell_corpus_list.extend(shell.cleaned_readme[entry])
shell_corpus = pd.Series(shell_corpus_list)
shell_corpus.describe()

count      516384
unique      54972
top       install
freq         5534
dtype: object

In [15]:
python_corpus_list = []
for entry in range(len(python)):
    python_corpus_list.extend(python.cleaned_readme[entry])
python_corpus = pd.Series(python_corpus_list)
python_corpus.describe()

count      110949
unique      15950
top       install
freq          980
dtype: object

In [16]:
c_corpus_list = []
for entry in range(len(c_lang)):
    c_corpus_list.extend(c_lang.cleaned_readme[entry])
c_corpus = pd.Series(c_corpus_list)
c_corpus.describe()

count      73730
unique     11446
top       kernel
freq        1068
dtype: object

In [17]:
other_lang_corpus_list = []
for entry in range(len(other_lang)):
    other_lang_corpus_list.extend(other_lang.cleaned_readme[entry])
other_lang_corpus = pd.Series(other_lang_corpus_list)
other_lang_corpus.describe()

count      270389
unique      37566
top       install
freq         1769
dtype: object

In [18]:
top_1000.head().index

Index(['install', 'use', 'run', 'file', 'script'], dtype='object')

In [19]:
totals = []
for i in top_1000.index:
    total = 0
    for j in range(len(shell)):
        if i in shell.cleaned_readme[j]:
            total +=1
    totals.append(round(total /len(shell), 4))
    print("Shell" + i)
shell_percentages = totals


totals = []
for i in top_1000.index:
    total = 0
    for j in range(len(python)):
        if i in python.cleaned_readme[j]:
            total +=1
    totals.append(round(total /len(python), 4))
    print("Python" + i)
python_percentages = totals

totals = []
for i in top_1000.index:
    total = 0
    for j in range(len(c_lang)):
        if i in c_lang.cleaned_readme[j]:
            total +=1
    totals.append(round(total /len(c_lang), 4))
    print("C" + i)
c_percentages = totals

totals = []
for i in top_1000.index:
    total = 0
    for j in range(len(other_lang)):
        if i in other_lang.cleaned_readme[j]:
            total +=1
    totals.append(round(total /len(other_lang), 4))
    print("Others" + i)
other_lang_percentages = totals

Shellinstall
Shelluse
Shellrun
Shellfile
Shellscript
Shelldefault
Shellbuild
Shellsystem
Shellusing
Shelldocker
Shellmake
Shellkernel
Shellbash
Shellpackages
Shell1
Shellversion
Shellpackage
Shellimage
Shellset
Shellserver
Shellfiles
Shellconfiguration
Shelluser
Shellinstallation
Shell2
Shellupdate
Shellalso
Shellsee
Shellneed
Shellcommand
Shellused
Shelladd
Shellfollowing
Shelldirectory
Shellnew
Shelle
Shellwant
Shellname
Shellcreate
Shellinstalled
Shell3
Shellsupport
Shellp
Shellrunning
Shellone
Shellenvironment
Shellexample
Shellrepository
Shelllike
Shellboot
Shellcd
Shellplease
Shellmay
Shellstart
Shellde
Shellsource
Shellwork
Shelldownload
Shellavailable
Shellcontainer
Shellshell
Shellensure
Shelllist
Shelltime
Shellget
Shellapt
Shellscripts
Shellx
Shellproject
Shellnote
Shelllicense
Shellconfig
Shellssh
Shellsoftware
Shellsh
Shellfirst
Shellcode
Shellsetup
Shellchange
Shellkey
Shell4
Shellpassword
Shellclone
Shellservice
Shellvia
Shellpacman
Shelloptions
Shellcheck
Shellhost
Shel

Shellless
Shelldue
Shellgrep
Shellerrors
Shellaula
Shellnano
Shellinitial
Shellaction
Shelldeploy
Shellprocesses
Shelllet
Shellleft
Shellprompt
Shellreference
Shelllocated
Shellprevious
Shellenv
Shellque
Shellboard
Shellopenstack
Shellcontaining
Shellfast
Shellentry
Shellnodes
Shellwish
Shellcli
Shelljunest
Shellflags
Shellive
Shellexist
Shellextension
Shellssl
Shellbtrfs
Shellauthentication
Shellintel
Shellinstallsh
Shellkernels
Shellcreates
Shellvpn
Shellnotice
Shellnewer
Shellcertificate
Shellandor
Shelltab
Shellreleased
Shellagent
Shellfedora
Shelllocally
Shellaround
Shellexternal
Shellmonitor
Shellactions
Shellquestions
Shellcis
Shellwireguard
Shelluninstall
Shelltmp
Shellregular
Shellum
Shelli3
Shellready
Shellfully
Shellhub
Shelllets
Shellcall
Shellefi
Shellseparate
Shellreally
Shellconfigurations
Shellreal
Shellamd64
Shellfollows
Shellencryption
Shellcontributing
Shelltodo
Shellgcc
Shelllooking
Shellfeel
Shelleditor
Shell24
Shellem
Shellgpu
Shellstored
Shelltraining
Shellcompil

Shelljoin
Shellpoints
Shellhold
Shellplus
Shellplaced
Shellcolors
Shellse
Shellcommandline
Shellstage
Shellrede
Shelldebianubuntu
Shelllogo
Shellupgrading
Shelldisplayed
Shellmapping
Shellvirtualization
Shellthink
Shelllightweight
Shellpretty
Shellbuildessential
Shell8080
Shellmostrando
Shellsignature
Shellability
Shellpolybar
Shellaccept
Shellprovision
Shellhelps
Shelldeepin
Shellbattery
Shellseparated
Shelltarball
Shellgives
Shell27
Shelli386
Shellencrypt
Shellupon
Shellsdk
Shellpreferred
Shellbasicos
Shellprebuilt
Shellblocks
Shelltermux
Shellpkgbuilds
Shellkali
Shellimplemented
Shellfine
Shellvolumes
Shellignored
Shellami
Shelluncomment
Shellfan
Shellclose
Shellinitrd
Shellcopies
Shellnocheckcertificate
Shellreplaced
Shellwheel
Shelltemporary
Shellshown
Shelludp
Shellou
Shellnull
Shellhardening
Shelltmux
Shellacross
Shellboxes
Shellfeedback
Shellcustomization
Shellforward
Shellinfrastructure
Shellquite
Shellmaintenance
Shellgem
Shellselection
Shellbits
Shellattempt
Shellsuccessfull

Shellsubnet
Shellmostly
Shellworker
Shellrename
Shellarticle
Shellasked
Shellsleep
Shellfunctional
Shellrestricted
Shelltray
Shellwidth
Shellclassification
Shellrerun
Shellpip3
Shellsearching
Shelldigital
Shelldistribute
Shellboards
Shellvi
Shellfamily
Shelldebhelper
Shellrecipe
Shellsuffix
Shellterm
Shellmatrix
Shellwarranties
Shellremaining
Shellfi
Shellfields
Shelldeclarations
Shellkeyring
Shellimproving
Shelldocuments
Shelltimer
Shelletcsshsshd_config
Shellsmaller
Shelltoolkit
Shellsuggest
Shellhand
Shellfish
Shellbehaviour
Shellperl
Shellattached
Shellbenchmarks
Shellcommits
Shellinvoke
Shellprocessor
Shellfar
Shelldevsda1
Shellproviders
Shellconfigurar
Shelldefining
Shellanyway
Shellcapture
Shellcountry
Shell1000
Shellparsing
Shell2015
Shellsamba
Shellmongodb
Shellyubikey
Shellxml
Shellmagic
Shellun
Shellinitialize
Shellcompliance
Shellhomebrew
Shellproceed
Shellaway
Shellrespectively
Shellreinstall
Shellenhanced
Shellopened
Shelldie
Shelltelegraf
Shellnewest
Shellzshrc
Shelllos


Shellselecting
Shellreadmemd
Shell55
Shellnotify
Shelladdr
Shellbatch
Shelllaunchpad
Shell01
Shellcinder
Shellphase
Shellinteraction
Shell4096
Shellcursors
Shellcaas
Shellceilometer
Shellbanner
Shellalternate
Shellbbr
Shelluname
Shelllead
Shelltargz
Shellmarkdown
Shellproducts
Shellcable
Shellchoices
Shellscale
Shellexplain
Shelldescriptions
Shellprotect
Shellrebooting
Shellwrites
Shellgems
Shellsoc
Shellfpga
Shellfreebsd
Shellansiblegalaxy
Shelltranslations
Shellscaling
Shellobjects
Shellcompose
Shelldist
Shellap
Shellsiteyml
Shellmean
Shell0000
Shellbeautiful
Shellxps
Shelltlp
Shellsqueeze
Shellunused
Shell0600
Shellfallback
Shellmodes
Shellwwwdata
Shellaccessing
Shellflow
Shellopensuse
Shellcoming
Shellpure
Shellflexible
Shellparametros
Shellsignal
Shellinteractively
Shelllogout
Shelldest
Shellaria2
Shellunnecessary
Shellcontainer_id
Shelltabs
Shellwm
Shellgiving
Shell44
Shelldebsrc
Shellact
Shellpixel
Shellhappy
Shellabsolute
Shellmodeling
Shellgranted
Shellsufficient
Shelldeleting

Shellpathname
Shellefficiency
Shellopens
Shelltbody
Shellcreative
Shellconnectivity
Shellecdsa
Shellmib
Shellmaintaining
Shellminimize
Shell4gb
Shellhuge
Shellglance
Shellbase00
Shellqemuuserstatic
Shelllaunches
Shellbelieve
Shellvertical
Shellparallels
Shellreplacing
Shell443
Shelldeployments
Shellbundled
Shellfazer
Shellmodules_install
Shellfoi
Shellpipewire
Shelletcdefaultgrub
Shellcreator
Shellminer
Shellscanning
Shellassign
Shell10000
Shellmime
Shellprotected
Shelllack
Shellpgid0
Shellrecommends
Shellaffects
Shell2222
Shellunified
Shellcrypto
Shellremotely
Shellscheduler
Shellships
Shellplaces
Shellusrbin
Shellein
Shellslower
Shelltouchscreen
Shellaskbecomepass
Shellstars
Shellarchaudit
Shellbrowsing
Shellgtx
Shellsupplied
Shellsubdirectories
Shelllsb_release
Shellsignificant
Shelllocalmodconfig
Shellvulnerabilities
Shellelsewhere
Shellsimplest
Shellmoves
Shellpayload
Shellsourced
Shelltooling
Shelladditions
Shellumask000
Shellchild
Shelldevscripts
Shellguidance
Shellarchamd64
She

Shellubi
Shellsuggests
Shelldisplaylink
Shellai
Shellser
Shellunderlying
Shelldsc
Shelltarget_blank
Shellchecksums
Shellpuiduid
Shellstatusbar
Shellcompress
Shellphilosophy
Shelladministrative
Shellpgidgid
Shellchips
Shellslack
Shellgplv2
Shellsynced
Shelltuning
Shellfixing
Shellinfinite
Shellpowertop
Shellchoosing
Shellpacmanconf
Shellbootup
Shellroom
Shellfamiliar
Shellsf
Shelldebianchangelog
Shelluntrusted
Shell10g
Shellpruning
Shelloutbound
Shellparte02
Shellgrubinstall
Shellxournalpp
Shellorange
Shellrobotics
Shell00000000
Shelltrizen
Shellbooks
Shellsquashfstools
Shellcomfortable
Shellwd
Shellfica
Shellpreshared
Shelldropbear
Shellclamav
Shellmx
Shellpermite
Shelldetecting
Shellperiodically
Shellpeek
Shellstrange
Shelldiscovered
Shell644
Shellfurnished
Shellcriar
Shellverifying
Shellmitigate
Shellalez
Shellcomputing
Shellarg
Shellzswap
Shellopenvpns
Shellbinfmtsupport
Shell72
Shell39
Shellrunsh
Shellpackagename
Shellepoch
Shellrow
Shelllocked
Shellunsigned
Shellcontrolling
Shellw

Shelllucee
Shellstderr
Shellcalibration
Shell1080
Shelltap
Shellcollective
Shellrom
Shellinotify
Shellfuzzbunch
Shellthoroughly
Shellcookies
Shellfps
Shellfreed
Shellresizing
Shellsua
Shellbroadcom
Shellstrategy
Shellbackends
Shellpartially
Shelldpi
Shell19216801
Shelladapted
Shell2008
Shellthreat
Shellmodifies
Shellweekly
Shellvault
Shellundef
Shellurxvt
Shellaug
Shellbeware
Shelldistillation
Shellalsautils
Shellcolorscheme
Shellindexhtml
Shellusageusage
Shellsucceed
Shellquarry
Shellmy_init
Shellrpi23genimagesh
Shellremount
Shellnvim
Shelldifferentiable
Shellshadow
Shell5183
Shelllinker
Shellwanna
Shellimported
Shellcourses
Shellpresence
Shellextracts
Shellindicating
Shellexe
Shellnicht
Shell16g
Shellexcuting
Shellban
Shelleditors
Shellmanipulate
Shellsshconfig
Shellbacklight
Shellfall
Shellxxinfo
Shellovh
Shellcompat
Shellairvisual
Shellrewrite
Shelldevnvme0n1
Shellvirtmanager
Shellnotices
Shellautodetect
Shellcpio
Shelltune
Shellreducing
Shellmonths
Shellkeybindings
Shellrock
Shell

Pythonmodule
Pythoninclude
Pythoncurrently
Pythonr
Pythoninstructions
Pythonservices
Python6
Pythontext
Pythonlog
Pythonmount
Pythontesting
Pythonmany
Pythontarget
Pythoninfo
Pythonshow
Pythonsince
Pythonneeds
Pythonlook
Pythoneven
Pythontake
Pythongenerate
Pythoneither
Pythonexport
Pythondescription
Pythonmodules
Pythontheme
Pythonchroot
Pythonnetworks
Pythontwo
Pythonmysql
Pythonupdates
Pythonworks
Pythonable
Pythonphp
Pythonuseful
Pythonfix
Pythonpage
Pythonerror
Pythonincluding
Pythondistribution
Pythonpi
Pythongeneral
Pythonprovide
Pythoncard
Pythonconsole
Pythonback
Pythonstill
Pythonid
Pythonvalues
Pythonstep
Pythonfunction
Pythoninside
Pythondisplay
Pythonsteps
Pythonexecute
Pythonconnect
Pythondriver
Pythonstable
Pythonbox
Pythontested
Pythonneural
Pythonrestart
Pythoncp
Pythonyes
Pythonrequirements
Pythonselect
Pythonfollow
Pythonhardware
Pythonmanual
Pythonrm
Pythonbranch
Pythondefined
Pythonhosts
Pythonbinary
Pythonprovides
Pythonend
Pythonmenu
Pythonenter
Pythonrecommended

Pythoncases
Pythondir
Pythongraphics
Pythonwait
Pythoncompatibility
Pythonfixed
Pythonsu
Python22
Pythonruntime
Pythonbutton
Pythoncomes
Pythonrather
Pythonpersonal
Pythontask
Pythoncheckout
Pythoncontact
Pythonhard
Pythonrandom
Pythonreset
Pythonhelper
Pythonmakefile
Pythoncontinue
Pythongeneration
Pythonvisual
Pythondesired
Pythonxx
Pythonworld
Pythonnodejs
Pythoneasier
Pythonla
Pythonenables
Pythonarchitectures
Pythonprefer
Pythonunity
Python2004x
Pythongroups
Pythonexperimental
Python32bit
Pythonask
Python80
Pythonprevent
Pythonpython3
Pythonenjoy
Pythonbootstrap
Pythontoken
Pythonfuture
Pythonutility
Pythontdstringtd
Pythoncertificates
Pythonyoutubedl
Pythononline
Pythondepends
Pythonsnap
Pythonexperience
Pythonsr
Pythonlikely
Pythonchromium
Pythonthree
Python167
Pythontouch
Pythonlearn
Pythonsuite
Pythonones
Pythontimezone
Pythonremember
Pythonwhole
Pythonwrong
Pythonredis
Pythonaudit
Pythonwriting
Pythonjson
Pythonmissing
Pythoncontributors
Pythonls
Pythonreinforcement
Pythonsel

Pythonproduct
Pythonps
Pythonarray
Pythondrop
Pythonmerge
Pythonpriority
Pythonincrease
Pythonopenbox
Pythonwanted
Python17
Pythonrwrr
Pythondebops
Pythondemo
Pythonwatch
Pythonrights
Pythonregister
Pythondiscord
Pythontouchpad
Pythonxenial
Pythonbackups
Pythonpasswords
Pythonpdf
Pythonwordpress
Pythonunique
Pythonnothing
Pythonfilter
Pythonclock
Pythonassuming
Pythoncharacters
Pythondonate
Pythonsensors
Pythonshall
Pythonrelative
Pythondays
Pythonconf
Pythonvms
Pythongot
Pythonelasticsearch
Pythoncollected
Pythoncontributions
Pythonfun
Pythonpowershell
Pythonupgraded
Python19
Pythonetcfstab
Pythonoverridden
Pythoncharacter
Pythonfakeroot
Pythonrefresh
Pythonresize
Pythonpid
Pythonmeta
Pythontpm
Pythondark
Pythonlinked
Pythonresource
Pythonindicator
Pythonpypi
Pythonhope
Pythoncodeblock
Pythonscan
Pythontweaks
Pythonfetch
Pythongames
Pythontransaction
Pythontrusty
Pythonappimage
Pythonfoundation
Pythonoptimized
Pythonpreview
Pythonimmediately
Pythondevnull
Pythonevents
Pythontowards
Py

Pythonapache2
Pythonconky
Pythonemulator
Pythonproc
Pythonunable
Pythonbsd
Pythonrelay
Pythonbigger
Pythonopenmp
Pythonetcaptsourceslist
Pythonisolated
Pythonrobustness
Pythonemacs
Pythondeployed
Pythonadapt
Pythonaccordingly
Pythonsquid
Pythondouble
Pythonobtain
Pythonspecifying
Pythoncanonical
Pythonvendor
Pythonhosted
Pythonbunch
Pythonchip
Pythonautoconf
Pythonmask
Pythonrouting
Pythontries
Python42
Pythontex
Pythonlots
Pythonmanjaro
Pythonuid
Pythonarcolinux
Python33
Pythonoffers
Pythonpacmankey
Pythonseem
Pythonlatex
Pythoncrash
Pythonconvenient
Pythonproprietary
Pythonearly
Pythonconfigurable
Pythonpossibly
Pythonmkinitcpio
Pythonbzip2
Pythontaking
Pythonadb
Pythonlicence
Pythoninsecure
Python2013
Pythongimp
Pythonfstab
Pythonadaptive
Pythonnovnc
Pythonmeaning
Pythonarchlinuxcn
Pythonphpmyadmin
Pythonsshd
Pythonapplies
Pythondelay
Pythonopencv
Pythondunst
Pythonprints
Pythonactivated
Pythonnamespace
Pythonfiletxt
Pythonupdatercd
Pythonhelpful
Python20201223
Pythonguidelines
Pyth

Pythonele
Pythonlaunching
Pythondifferences
Pythonsituation
Pythonencounter
Pythonwent
Pythonservidores
Pythonparsecmgmt
Pythonresume
Pythonrequested
Pythondiscovery
Pythonstar
Pythonvalidate
Pythonodroid
Pythonafterwards
Pythoncaused
Pythonsymbolic
Pythonseparately
Pythonprimeiro
Pythonsymlinks
Pythonsmtp
Pythonatom
Pythonpostinstall
Pythoncodigo
Pythonbot
Pythonstig
Pythonnova
Pythonminecraft
Pythonesp8266
Pythonpythonpip
Pythoncustomizar
Pythonmod
Pythoncut
Pythondxvk
Pythonreproducible
Pythoncompared
Pythonmisc
Pythontalk
Pythonscsi
Pythonlsmod
Pythonrenamed
Pythoncustomizable
Pythonexact
Pythoncups
Pythonvagrantfile
Pythonhierarchical
Pythonisos
Pythonaffect
Python29
Pythonalacritty
Pythondhvirtualenv
Pythonvisudo
Pythonconst
Pythonsuperuser
Pythonstay
Pythontheyre
Pythonarising
Pythonreprepro
Pythontransparent
Pythonexclude
Pythonlove
Pythoncecos
Pythonelements
Pythonco
Pythonideal
Pythonmd5
Pythonrecords
Pythonrepoctl
Pythoninteger
Pythoncircleci
Pythonmanages
Pythontodos
Python

Pythoneffects
Pythonstartx
Pythonvoid
Pythonxorriso
Pythonpicture
Pythonnftables
Pythonp7zip
Pythonflat
Pythonpatterns
Pythonfinding
Pythonanswers
Pythonfusion
Pythonmicrosd
Pythonmetric
Pythoniface
Pythonsubsystem
Pythonwildcard
Pythonweight
Python450
Pythonchroots
Pythonb3
Python116
Pythonnvm
Pythondialogue
Pythonpulling
Pythonpythondev
Pythonlegal
Pythonfastboot
Pythondh
Pythonrecompile
Pythondockerfiles
Pythonhot
Pythonatualizar
Pythontrash
Pythoncontrollers
Pythonsegundo
Pythoninterpreter
Pythonpacback
Pythonwhite
Pythonalongside
Pythonrev
Pythonpreference
Pythonrubygems
Pythonrequirementsyml
Pythonciphers
Pythonformatting
Pythonetcgroup
Pythoncertbot
Pythonpartial
Python_vps_
Pythonrelatively
Pythonelapsed
Pythoncss
Pythonetcd
Pythonasf
Pythonseu
Pythonwpa_supplicant
Pythonrollback
Pythonextracting
Pythonwheels
Pythongreatly
Pythonwww
Pythonautomations
Pythoncommunicate
Pythonlemp
Pythonvhost
Pythonplugged
Pythonescape
Pythontomcat9
Pythonadjusting
Pythonpode
Pythoncargo
Pythonsc

Pythonrebuilding
Pythonoldconfig
Pythonstated
Pythongitannex
Pythondeclared
Pythonro
Pythonfinds
Pythonsamsung
Python123
Pythonacquire
Pythonnettools
Pythonalsa
Pythonsubstitute
Pythontermite
Pythontriggered
Pythonmetal
Pythonlongterm
Pythonrabbitmq
Pythonpassenger
Pythonenergy
Pythondevsdb
Pythonsubfolder
Pythoncaveats
Pythoncircle
Pythonprinter
Pythonce
Pythonradio
Pythongrubmkconfig
Pythonstopping
Pythonhostnames
Pythoncarry
Pythonappreciated
Pythonsandbox
Pythonmess
Pythonsequential
Pythonreasoning
Pythonuncertainty
Pythonfirejail
Pythonminpoll
Pythonmiscellaneous
Pythonhardcoded
Pythonlxde
Pythonecdh
Pythonnode_modules
Pythonnmap
Pythondirs
Pythonauf
Pythonbool
Pythonwarn
Pythonhopefully
Pythonlz4
Pythonesse
Pythonapt2ostree
Pythoninterruption
Pythonsun
Pythonxtom
Python118
Pythonpatent
Pythongitbuildpackage
Pythonupdatesh
Pythoni3wm
Pythonquota
Pythonsssdconf
Pythonbacked
Pythonassumed
Pythonbfcbc
Pythonrely
Pythonoverridesaddinglocaloverrides
Pythondialogs
Pythonmultithreaded
Py

Pythonexplaining
Pythonocs
Pythongeometry
Pythonst
Pythonpersist
Pythoncurve
Pythoncancelling
Pythonrotate
Pythonvivaldi
Pythonooutputdir
Pythonremix
Pythonconfigura
Pythontuned
Pythonrecursos
Pythondatasets
Pythoncalibre
Pythonskills
Pythonversioned
Pythonsake
Pythonsidebar
Pythonalert
Pythoncompiles
Pythonpep8
Pythonstays
Pythonmiddle
Pythonpureftpd
Pythonalpm_release
Pythonclearly
Pythonprocessed
Pythondevmmcblk0
Pythontypeyourfilename
Pythonsingularity
Pythonduo
Pythondelimiter
Pythonsynthesis
Pythonconverted
Pythonnoah
Pythoninitd
Pythonlargescale
Pythonretina
Pythonconfig_kallsyms
Pythoncuz
Pythonrecognized
Pythonterminator
Pythonav
Pythonhetzner
Pythonregression
Pythonswapon
Pythonsynapse
Pythonthinkpad
Pythonmkisofs
Pythonsaveenv
Pythoncoprocessor
Pythonarchdi
Python_________________
Pythonslurmlocalhost
Pythonencountered
Pythonobjetivo
Pythonkeywords
Python1500
Pythonjun
Pythonpreseedcfg
Pythongif
Pythondef
Python133
Pythonwatchdog
Pythonenforce
Pythonipython
Pythontexto
Pytho

Cencryption
Ccontributing
Ctodo
Cgcc
Clooking
Cfeel
Ceditor
C24
Cem
Cgpu
Cstored
Ctraining
Ccompiling
Cproperly
Cshared
Cimport
Ctakes
Cln
C_
Clicensed
Cseconds
Cmove
Cdistributed
Cmap
Cfonts
Cbluetooth
Calt
Cunit
Cpush
Cextract
Cwebsite
Cscreenshots
Cbinaries
Cbooting
Cchannel
Cresources
Creason
Creadme
Cdevelopers
Cunless
Cwine
Cminutes
Cplugins
Cdevops
Cview
Cbar
Calias
Cdownloads
Cpost
Cblock
Ctcp
Cconnections
Cdefines
Cdependency
Cauthor
Cdepending
Cflag
Ccurso
Coverride
Cactive
Cuefi
Crepos
Ccomponents
Cfailed
Cperform
Cipv6
Calong
Cthats
Cyay
Ccustomize
Cnever
Cavoid
Cmakepkg
C16
Cmail
Cstack
Cothers
Crefer
Cguest
Cthemes
Ccompiled
Cactually
Cstructure
Cupdating
Cappropriate
Cforce
Csnapshot
Cmatch
C32
Cempty
Cworkspace
Cresults
Csite
Csync
Cnoconfirm
Csolution
Cxfce
Ctell
Cmirrors
Ccompression
Climited
Cconsider
Creading
Cgoing
Cverbose
Cargument
Cfinally
Cmounted
Cstatic
Calternatively
Ctimeout
C15
Cexists
C13
Cx86_64
Caws
Cpermission
Ciptables
Cchoice
Ccompatible
Cday
Cinstal

Cbooted
Csections
Cxournal
Chit
Cblack
Cdkms
Csort
Csequence
Cbundle
Ccompleted
Cetcpacmanconf
Cprocessing
Cdisplays
Cparted
Ccodes
Cspotify
Ccenter
C200
Cdebianbased
Cresponsible
Cdestination
Cincoming
Carmhf
Cmailing
Cduplicate
Cfit
Cv2
Cforeground
Cattacks
Cowner
Cprogramming
Csensor
Cfoo
Cfact
Cscheme
Cproxmox
Clabel
Cpkg
Cofficially
Csplit
Chosting
Canswer
Cbuiltin
Ctftp
C512
Ckill
Cpostfix
Csyslog
Caddons
Cvalidation
Cumount
Cexecutar
Cbionic
Cfpm
Cpool
Ctwitter
Cearlier
Cspecification
Ccontrols
Crepositorio
Cdepend
Cmatches
Ccores
Chandling
Carchinstall
Cforum
Ccmake
Cperformed
Cx64
Cyaourt
Caccounts
Cnosso
Cgdb
Cmate
Cxfce4
Cdialog
Cgame
Cscope
Cpreseed
Csuggested
Cgitlab
Cregion
Caware
Crsyslog
Cios
Cdonation
Cdiff
C35
Cprompts
Ch3
Ctried
Cplasma
Calgorithms
Cdisks
Csomeone
Curi
Ccomparison
Csql
Csharing
Ccouple
Cexpression
Csi
Coverrides
Cmerged
Ctrust
Csyu
Cuniversal
Cshift
Cembedded
Cair
Cutc
Cdocumented
Cunpack
C2016
Crsync
Cpick
Cmodifying
Cwheezy
Cplaylist
Cdevelop
Climi

Casus
Cwlan0
C1910
C180
Clisting
Cportable
Czabbixserver
Cchar
Cscripting
Cacessar
Cbootgrubgrubcfg
Ch2
Cpermit
Csaving
Cideas
Cpublishing
C2010
Cinitially
Cnumeric
Crake
Cprocedures
Cthreads
Cexits
Cgparted
Cxdebug
Cdebuild
Cmicrocode
Cworry
Cmonth
Cmenuconfig
Crepeat
Csemantic
Chdd
C800
Cstochastic
Cdevtools
Crequiring
Cstages
Cplaying
Ctechnology
Cexposed
Ckeeping
Cinstallers
Ccover
Cstores
Caligncenterimg
Cutilizando
Cpyenv
Cvarwww
Cfossa
Cnvram
Cstopped
Cpostgres
Clost
Cperiod
Cforked
C0000000
Clogfile
Cyn
Cdevshm
Cag
Csocial
Cbrbr
Capplicable
Cbook
Cpotential
Cpowerpc
Csafety
Climitation
Ctoday
Cdestroy
Calpha
Cremovable
Cubuntudebian
Cdash
Cowncloud
Cqueries
Cpin
Cgreen
Cns
Capple
Cguarantee
Cgid
Cframeworks
Cclosed
Cthread
Cgplv3
Cbuy
Csensitive
C51
Cuserland
Cexchange
Cobviously
Cauthorized
Clibtool
Comitted
Cpane
Cwidget
Cextend
Cele
Claunching
Cdifferences
Csituation
Cencounter
Cwent
Cservidores
Cparsecmgmt
Cresume
Crequested
Cdiscovery
Cstar
Cvalidate
Codroid
Cafterwards
Cc

Cnearly
Copensshserver
Cswitched
Cgroupadd
Cputting
Cugly
Cends
Csysvinit
Csnaps
C608080
Cbs1m
Cbesides
Cpython3pip
Cwal
Cprojeto
Crealtime
C65536
Cfingerprint
Cburn
Cofdevsdx
Cstructured
Crequirementsrequirements
Cexcellent
Cmbr
Cuploading
Cbeginners
Cincreased
Cgap
Cauthenticator
Cfilesconfig
Celement
Clinear
Cbdist_deb
Clint
Creboots
Captmetalink
Cfeed
Caimg
Creads
Cbootstrapsh
Cbootargs
Cgnuroot
Cinactive
Cfactors
Ccvescan
Cthunderbird
Cmotd
Cexpanded
Cbookmarks
Csvn
Ccollect
Cpasswordless
Csystemdboot
Ccircumstances
Cperspective
Cstands
Cyellow
Cfewshot
Cdebpkg
Ccontributingmdcontributingmd
Codd
Cchance
Cpermanently
Cdatastax
Csuru
Cpresented
Cswappiness
Cjournald
Ctokens
Cadsysd
Cmetadebian
Cbootp
C4x
Coverwritten
Csardi
Cneeding
Clinode
Chd
Creturned
Cxubuntudesktop
Cpushed
Cumaskumask
Cpersons
Castos
Cupper
Crocky
C52
Clocalization
Ctriggers
Cusefull
Cflavors
Cist
Cinstallable
Cincremental
Cscenario
Cdebians
Cbugfix
Crestrict
Cfur
Cfurthermore
Cpavucontrol
Coverhead
Cvary
Cctrl

Csingularity
Cduo
Cdelimiter
Csynthesis
Cconverted
Cnoah
Cinitd
Clargescale
Cretina
Cconfig_kallsyms
Ccuz
Crecognized
Cterminator
Cav
Chetzner
Cregression
Cswapon
Csynapse
Cthinkpad
Cmkisofs
Csaveenv
Ccoprocessor
Carchdi
C_________________
Cslurmlocalhost
Cencountered
Cobjetivo
Ckeywords
C1500
Cjun
Cpreseedcfg
Cgif
Cdef
C133
Cwatchdog
Cenforce
Cipython
Ctexto
C101
Cpurifiers
Clocking
Cclusters
Chost_vars
Csrv
Cpacoloco
Clxappearance
Cgenisoimage
Cpatient
Cstatusprogress
Cperformant
Clinus
Cdebianversion
Cwo
Caim
Cxboxdrv
Cresets
Cdevpts
Cprivilegedtrue
Cremains
Caes
Cpgsql
Cpowerline
Crestored
Cdocumentationchanges
Cmeld
Coffending
Cenabledisable
Clinux1
Cappjs
Cretry
Cnextcloud
Cbasics
Cbugfixes
Czabbix_server_database_long
Cverbosity
Cxauthority
Crxvtunicode
Croutines
Cv7
Cplymouth
Cconhecendo
Cimplementations
Cprincipal
Cgettext
Cetcliloconf
Clnd
C2021li
Cthreading
Cspawn
Cbrackets
Cwatching
Cgreeter
Cbaseline
Csong
Csoftwarepropertiescommon
Crecursesubmodules
Cdebianpiaarch64
Cstem

Othersgoogle
Othersandroid
Othersbasic
Othersruns
Otherslot
Otherszsh
Othersrules
Otherscomo
Othersmacos
Othersload
Otherscommit
Othersroles
Othersrequire
Othersdhcp
Othersplace
Otherspip
Othersgiven
Othersdirectories
Otherschanged
Otherspresent
Othersdate
Others7x
Othersminimal
Othersicons
Otherssave
Othersmirror
Others1604
Otherslevel
Othersapps
Othersmit
Others100
Othersstarting
Otherspackaging
Othersperformance
Others233
Otherscommon
Otherscontent
Othersoriginal
Othersfirewall
Otherscorrect
Otherseasily
Othersram
Othersdistributions
Othersanything
Otherscomputer
Othersmachines
Othersvarious
Otherslaunch
Othersswitch
Otherslogs
Others64bit
Othersmessage
Otherssed
Othersemail
Othersgetting
Othersresult
Otherspacker
Othersnamed
Othersgive
Othersmedia
Othersprograms
Otherstar
Otherslast
Otherscat
Otherssend
Othersconfiguracao
Others600
Othersie
Othersmethod
Othersplatform
Othersparticular
Otherscontroller
Othersexecutable
Othersexec
Othersul
Otherssession
Othersprint
Othersyet
Othersle

Othersdocument
Othersfolders
Otherspassed
Othersstick
Otherstutorial
Others__
Othersfaster
Otherslauncher
Othersconvert
Othersdetailed
Otherstoptableofcontents
Othersnetworkmanager
Othersstudio
Othersprogress
Othersopenssl
Othersutilizado
Othersfilename
Othersj
Otherslimit
Otherspostgresql
Othersdeployment
Othersremoving
Othersignore
Othersactivate
Othersentire
Othersexcept
Otherstls
Otherstargets
Othersphysical
Othersheadless
Othersvps
Othersnative
Othersinterfaces
Othersipv4
Othersmaintained
Othersnice
Otherscron
Othersplatforms
Others2019
Othersunzip
Othersmatching
Othersoptimization
Otherscopied
Othersallowed
Othersform
Otherstypes
Otherstimes
Otherskvm
Othersdaily
Otherslock
Othersprovider
Othersgtk
Othersbinbash
Otherszfs
Othersdownloading
Othersheavy_check_mark
Otherskind
Othersimprove
Othersyoutube
Othersgb
Othersquickly
Otherspaths
Othersrecovery
Othersarquivo
Othersways
Othersthus
Othersbecome
Otherscuda
Otherscourse
Otherstotal
Othersquality
Othersrest
Otherslvm
Othersforget

Othersnews
Othersdisabling
Othersqq
Othersopening
Otherspassing
Othersguides
Otherslocales
Otherschromebook
Othersfront
Othersprivacy
Otherses
Othersplayer
Othersextracted
Othersanalysis
Othersenterprise
Otherssigned
Othersitems
Othersaims
Othersdetermine
Othersffmpeg
Othersanymore
Othersevent
Othersxxxxxxxx
Othersapparmor
Otherssending
Otherstrack
Othersautostart
Othersnotification
Others34
Others5x
Othersthank
Othersboolean
Othersmandatory
Othersopensource
Othersxrandr
Othersheres
Othersusermod
Othersbauh
Otherssway
Othersnewly
Othersgradient
Otherssummary
Othersbooted
Otherssections
Othersxournal
Othershit
Othersblack
Othersdkms
Otherssort
Otherssequence
Othersbundle
Otherscompleted
Othersetcpacmanconf
Othersprocessing
Othersdisplays
Othersparted
Otherscodes
Othersspotify
Otherscenter
Others200
Othersdebianbased
Othersresponsible
Othersdestination
Othersincoming
Othersarmhf
Othersmailing
Othersduplicate
Othersfit
Othersv2
Othersforeground
Othersattacks
Othersowner
Othersprogramming


Othersvariational
Othersattribute
Othersgrant
Others433
Otherslength
Othersupstart
Othersconvenience
Othersbare
Othersregenerate
Othersrebuilt
Othersbasico
Othersnight
Othersattempting
Otherspassphrase
Othersadsysctl
Otherscinnamon
Othersconcept
Otherspackaged
Othersgpt
Otherscomposer
Otherslogical
Othersv1
Othersslightly
Otherslo
Othersassets
Othersactivation
Otherspersistent
Othersros
Othersjdk
Othersclassic
Otherss3
Othersirc
Othersdock
Othersmpv
Othersz
Othersbinsh
Othersnautilus
Otherssid
Othersphpfpm
Othersmin
Othersequivalent
Otherssubject
Othersdifficult
Othershack
Othersreader
Othersxxxx
Othersoptimize
Othershandles
Others0017caf73818
Otherskept
Othersmatter
Othersplayback
Othersphone
Othersmodification
Othersdeploying
Othersguacamole
Othersattach
Otherseip
Otherspossibility
Others56
Othersinet
Othersalarm
Othersetcresolvconf
Otherswebserver
Othersadwatchd
Othersmanagers
Othersalma
Othersesp
Othersfloating
Otherslan
Othersconsistent
Otherswide
Othersbridge
Othersdefault_
Other

Others70
Otherstechnical
Othersstreaming
Otherssl
Otherssymbols
Othersuna
Othersarchchroot
Otherssane
Othersdonations
Otherssha256
Othersplank
Othersdefinitions
Othersgoals
Othersfrequently
Othersdynamically
Othersrecognition
Othersheat
Othersmonitors
Othersdigitalocean
Othersathena
Otherstraditional
Othersactively
Otherscommonly
Othersbreaking
Otherswritable
Othersax
Othersrestriction
Others41
Othersconversion
Othersooniprobe
Otherstrigger
Othersle
Othersasks
Othersloopback
Othersvon
Othersweek
Otherscert
Othersvisible
Otherssmart
Othersentering
Otherspbuilder
Othersarmv7
Othersssid
Othersdesktops
Othersstaging
Othersitll
Otherschannels
Othersbumblebee
Otherscapable
Othersmetrics
Otherspeer
Othersabsolutely
Othersstats
Othersbenefit
Othersgraphic
Otherstdbooleantd
Otherssoft
Othersconditional
Othersyear
Othersfiltering
Othersrendered
Othersipsec
Otherssystemwide
Otherscaution
Otherscapacity
Otherstopic
Othershints
Othersexplicit
Othersported
Othersldap
Otherscycle
Othersauthorization


Otherswheels
Othersgreatly
Otherswww
Othersautomations
Otherscommunicate
Otherslemp
Othersvhost
Othersplugged
Othersescape
Otherstomcat9
Othersadjusting
Otherspode
Otherscargo
Othersscrolling
Others2nd
Otherspreparation
Othersrust
Othersreduces
Othersxubuntu
Othersninja
Othersadvised
Othersreleaseh3
Othersabort
Othersoneinstack
Otherschose
Othersinstala
Othersperhaps
Othersprofessional
Others3b
Othersmaps
Othersjournal
Othersmacosmojave
Othersopenvpninstallsh
Othersmips
Othersblocked
Othersholders
Othersstrict
Othershyperv
Othersdevsda3
Otherstile
Othersfn
Otherspopup
Othersforever
Othersdropbox
Others46
Otherstlscrypt
Othersaccessed
Othersthirdparty
Othersconflict
Othersrelies
Othersnatively
Othersartifactory
Othersdisc
Otherslinking
Othersrecipes
Othersjvm
Othersdetailssummary
Othersunset
Othersnodev
Otherstables
Othersprivilege
Othersbootm
Otherscac
Othersohomenamebuildkernel
Otherslivebuild
Othersrecvkeys
Others3rd
Othersweather
Othersprinted
Otherspacaur
Othersgerenciamento
Others

Otherscircle
Othersprinter
Othersce
Othersradio
Othersgrubmkconfig
Othersstopping
Othershostnames
Otherscarry
Othersappreciated
Otherssandbox
Othersmess
Otherssequential
Othersreasoning
Othersuncertainty
Othersfirejail
Othersminpoll
Othersmiscellaneous
Othershardcoded
Otherslxde
Othersecdh
Othersnode_modules
Othersnmap
Othersdirs
Othersauf
Othersbool
Otherswarn
Othershopefully
Otherslz4
Othersesse
Othersapt2ostree
Othersinterruption
Otherssun
Othersxtom
Others118
Otherspatent
Othersgitbuildpackage
Othersupdatesh
Othersi3wm
Othersquota
Otherssssdconf
Othersbacked
Othersassumed
Othersbfcbc
Othersrely
Othersoverridesaddinglocaloverrides
Othersdialogs
Othersmultithreaded
Othersnotebook
Othersethereum
Others2011
Othersapis
Otherssftp
Othersnicely
Othersdockerio
Othersage
Othersdebs
Othersnecessarily
Othersincompatible
Othersadjusted
Othersrespond
Othersfalar
Othersreached
Othersdeletes
Othersupdategrub
Othersadminer
Others2552552550
Othersmultistrap
Othersvarch
Othersrpc
Othersdu
Othersfini

Othersstartstopstatusrestartreload
Others58
Otherspackage_name
Otherswherever
Othersspend
Othersmkimage
Otherssyn
Othersnotation
Otherstechniques
Otherslate
Othersflex
Othersstruct
Otherskit
Othersseriously
Othersarcharchconfigsplatform_defconfig
Othersinstagram
Othersrank
Otherspmmu
Otherssyncthing
Othersjetson
Otherszathura
Othersnamelist
Othersrefind
Othersfzf
Otherszabbix_server_database
Othersmd5sum
Others1070
Otherscompressing
Otherstemporal
Othersdecryption
Othersexplaining
Othersocs
Othersgeometry
Othersst
Otherspersist
Otherscurve
Otherscancelling
Othersrotate
Othersvivaldi
Othersooutputdir
Othersremix
Othersconfigura
Otherstuned
Othersrecursos
Othersdatasets
Otherscalibre
Othersskills
Othersversioned
Otherssake
Otherssidebar
Othersalert
Otherscompiles
Otherspep8
Othersstays
Othersmiddle
Otherspureftpd
Othersalpm_release
Othersclearly
Othersprocessed
Othersdevmmcblk0
Otherstypeyourfilename
Otherssingularity
Othersduo
Othersdelimiter
Otherssynthesis
Othersconverted
Othersnoah
O

In [20]:
percentages = pd.DataFrame({"top_5000_words": top_1000.index, 'shell': shell_percentages, 'python': python_percentages, 'c': c_percentages, 'other_langs': other_lang_percentages})

In [21]:
percentages.head()

Unnamed: 0,top_5000_words,shell,python,c,other_langs
0,install,0.5982,0.6159,0.5957,0.5457
1,use,0.5375,0.5952,0.5887,0.5547
2,run,0.5438,0.5606,0.5177,0.5367
3,file,0.3961,0.519,0.5248,0.4698
4,script,0.4781,0.2215,0.2766,0.2188


In [22]:
percentages['range'] = None
for row in range(len(percentages)):
    max = 0
    min = 1
    for column in percentages.drop(columns = ['top_5000_words', 'range']).columns:
        if percentages[column][row] > max:
            max = percentages[column][row]
        if percentages[column][row] < min:
            min = percentages[column][row]
    percentages['range'][row] = (max - min)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percentages['range'][row] = (max - min)


In [23]:
percentages[['top_5000_words', 'range']][percentages.range < .01].top_5000_words.to_json('high_freq_stopwords.json')

In [24]:
percentages[['top_5000_words', 'range']][percentages.range < .01].top_5000_words

149          tdtd
159             v
161      learning
181        intelr
354          para
          ...    
4994     thinking
4995    restarted
4996     numerous
4997         rare
4999          128
Name: top_5000_words, Length: 1644, dtype: object

In [25]:
percentages[['top_5000_words', 'range']].sort_values(by = 'range', ascending = False).head(179)

Unnamed: 0,top_5000_words,range
105,python,0.4554
11,kernel,0.2729
4,script,0.2593
391,compile,0.2347
780,gcc,0.2315
...,...,...
705,less,0.1002
847,never,0.1001
813,reason,0.1001
20,files,0.1001


In [26]:
percentages

Unnamed: 0,top_5000_words,shell,python,c,other_langs,range
0,install,0.5982,0.6159,0.5957,0.5457,0.0702
1,use,0.5375,0.5952,0.5887,0.5547,0.0577
2,run,0.5438,0.5606,0.5177,0.5367,0.0429
3,file,0.3961,0.5190,0.5248,0.4698,0.1287
4,script,0.4781,0.2215,0.2766,0.2188,0.2593
...,...,...,...,...,...,...
4995,restarted,0.0056,0.0069,0.0071,0.0051,0.002
4996,numerous,0.0056,0.0104,0.0071,0.0039,0.0065
4997,rare,0.0075,0.0035,0.0071,0.0039,0.004
4998,iphone,0.0019,0.0000,0.0142,0.0039,0.0142
