In [3]:
#import tools for web scraping:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd

#group imports
import env
import acquire
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

In [38]:
#looking at one of our urls: 
url = "https://github.com/search?o=desc&q=archlinux&s=forks&type=Repositories"
headers = {"Authorization": f"token {env.github_token}", "User-Agent": env.github_username}
response = get(url, headers=headers)

In [39]:
#creating beautiful soup and html parser:
soup = BeautifulSoup(response.content, 'html.parser')

In [40]:
# see also `soup.find_all`
#
# beautiful soup uses `class_` as the keyword argument for searching
# for a class because `class` is a reserved word in python
# we'll use the class name that we identified from looking in the inspector in chrome
link = soup.find('a', class_='v-align-middle')
link.text

AttributeError: 'NoneType' object has no attribute 'text'

#### Need to make these a list:

In [6]:
#pulling 1100 repos from Ubuntu 
def github_api_query(page):
    url = f"https://api.github.com/search/repositories?q=ubuntu&s=forks&page={page}&per_page=100"
    response = requests.get(url, headers=headers)
    return response.json()


urls = []
c = 0
while True:
    print(f"page{c}")
    try:
        req = github_api_query(page=c)
        for item in req['items']:
            urls.append(item['html_url'])
        c += 1
    except:
        break

page0
page1
page2
page3
page4
page5
page6
page7
page8
page9
page10
page11


In [7]:
#creating a series of the url links:
u_links= pd.Series(urls)

In [8]:
#creating a column name for links
links = {'links':u_links }

In [9]:
#taking the series and creating a dataframe:
ubuntu = pd.DataFrame(links)

In [10]:
ubuntu

Unnamed: 0,links
0,https://github.com/dockerfile/ubuntu
1,https://github.com/boxcutter/ubuntu
2,https://github.com/wszqkzqk/deepin-wine-ubuntu
3,https://github.com/fcwu/docker-ubuntu-vnc-desktop
4,https://github.com/docker-32bit/ubuntu
...,...
1095,https://github.com/kurniawandata/nasihosting
1096,https://github.com/hortonworks/docker-protractor
1097,https://github.com/mastermindg/trac-docker-ubuntu
1098,https://github.com/rubiojr/surface3-kernel


In [11]:
#sending ubuntu to a csv file:
ubuntu.to_csv('ubuntu_repo_link', index=False)

In [12]:
#trying out reading the csv:

ubuntu_df = pd.read_csv('ubuntu_repo_link.csv')

In [13]:
#there she be:
ubuntu_df

Unnamed: 0,links
0,https://github.com/dockerfile/ubuntu
1,https://github.com/boxcutter/ubuntu
2,https://github.com/wszqkzqk/deepin-wine-ubuntu
3,https://github.com/fcwu/docker-ubuntu-vnc-desktop
4,https://github.com/docker-32bit/ubuntu
...,...
1095,https://github.com/kurniawandata/nasihosting
1096,https://github.com/hortonworks/docker-protractor
1097,https://github.com/mastermindg/trac-docker-ubuntu
1098,https://github.com/rubiojr/surface3-kernel


_____________________________________________________________________________

## Exploring Ubuntu:

In [5]:
ubuntu = pd.read_json('ubuntu_data.json')

In [6]:
ubuntu.head()

Unnamed: 0,repo,language,readme_contents
0,dockerfile/ubuntu,Shell,## Ubuntu Dockerfile\n\n\nThis repository cont...
1,boxcutter/ubuntu,Shell,# Packer templates for Ubuntu written in legac...
2,wszqkzqk/deepin-wine-ubuntu,C,# Deepin wine for Ubuntu and Debian\n\n## 一、项目...
3,fcwu/docker-ubuntu-vnc-desktop,HTML,# docker-ubuntu-vnc-desktop\n\n[![Docker Pulls...
4,docker-32bit/ubuntu,Shell,ubuntu\n======\n\nBuild a docker image for ubu...


#### Normalizing the data:

In [7]:
import unicodedata
import re

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

In [15]:
#to view just one of the readme docs and see what kind of normalizing is needed:
soup = BeautifulSoup(ubuntu.readme_contents.iloc[10], 'html.parser')
soup

Ubuntu 18.04 CIS STIG

[![Build Status](https://travis-ci.com/florianutz/Ubuntu1804-CIS.svg?branch=master)](https://travis-ci.com/florianutz/Ubuntu1804-CIS)
[![Ansible Role](https://img.shields.io/badge/role-florianutz.Ubuntu1804--CIS-blue.svg)](https://galaxy.ansible.com/florianutz/Ubuntu1804-CIS/)

Configure Ubuntu 18.04 machine to be CIS compliant. Level 1 and 2 findings will be corrected by default.

This role **will make changes to the system** that could break things. This is not an auditing tool but rather a remediation tool to be used after an audit has been conducted.

## IMPORTANT INSTALL STEP

If you want to install this via the `ansible-galaxy` command you'll need to run it like this:

`ansible-galaxy install -p roles -r requirements.yml`

With this in the file requirements.yml:

```
- src: https://github.com/florianutz/Ubuntu1804-CIS.git
```

Based on [CIS Ubuntu Benchmark v2.0.1 - 01-03-2020 ](https://www.cisecurity.org/cis-benchmarks/).

This repo originated from work do

Steps to parsing data:
- 1) Convert text to all lower case for normalcy.
- 2) Remove any accented characters, non-ASCII characters.
- 3) Remove special characters.
- 4) Stem or lemmatize the words.(stem = "if b, then c")
- 5) Remove stopwords.(if, and, the, etc)
- 6) Store the clean text and the original text for use in future notebooks.

In [16]:
#used to clean and normalize a string:
def basic_clean(string):
    """A function that uses NLTK to clean and normalizes a string"""
    string = string.lower()
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    string = re.sub(r"[^a-z0-9'\s]", '' ,string)
    return string

In [20]:
#let's test this out on the example:
example=ubuntu.readme_contents.iloc[10]
example



In [23]:
#testing function:
example=basic_clean(example)
example

"ubuntu 1804 cis stig\n\n\nbuild statushttpstraviscicomflorianutzubuntu1804cissvgbranchmasterhttpstraviscicomflorianutzubuntu1804cis\nansible rolehttpsimgshieldsiobadgeroleflorianutzubuntu1804cisbluesvghttpsgalaxyansiblecomflorianutzubuntu1804cis\n\nconfigure ubuntu 1804 machine to be cis compliant level 1 and 2 findings will be corrected by default\n\nthis role will make changes to the system that could break things this is not an auditing tool but rather a remediation tool to be used after an audit has been conducted\n\n important install step\n\nif you want to install this via the ansiblegalaxy command you'll need to run it like this\n\nansiblegalaxy install p roles r requirementsyml\n\nwith this in the file requirementsyml\n\n\n src httpsgithubcomflorianutzubuntu1804cisgit\n\n\nbased on cis ubuntu benchmark v201  01032020 httpswwwcisecurityorgcisbenchmarks\n\nthis repo originated from work done by mindpointgrouphttpsgithubcommindpointgrouprhel7cis\n\nrequirements\n\n\nyou should ca

In [24]:
def tokenize(string):
    """This function will take in a string, tokenize by breaking any leftover words into units and return 
    the tokenized string"""
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    string = tokenizer.tokenize(string, return_str=True)
    return string

In [26]:
#testing function:
example = tokenize(example)
example

"ubuntu 1804 cis stig\n\n\nbuild statushttpstraviscicomflorianutzubuntu1804cissvgbranchmasterhttpstraviscicomflorianutzubuntu1804cis\nansible rolehttpsimgshieldsiobadgeroleflorianutzubuntu1804cisbluesvghttpsgalaxyansiblecomflorianutzubuntu1804cis\n\nconfigure ubuntu 1804 machine to be cis compliant level 1 and 2 findings will be corrected by default\n\nthis role will make changes to the system that could break things this is not an auditing tool but rather a remediation tool to be used after an audit has been conducted\n\n important install step\n\nif you want to install this via the ansiblegalaxy command you ' ll need to run it like this\n\nansiblegalaxy install p roles r requirementsyml\n\nwith this in the file requirementsyml\n\n\n src httpsgithubcomflorianutzubuntu1804cisgit\n\n\nbased on cis ubuntu benchmark v201 01032020 httpswwwcisecurityorgcisbenchmarks\n\nthis repo originated from work done by mindpointgrouphttpsgithubcommindpointgrouprhel7cis\n\nrequirements\n\n\nyou should c

#### Should we stem or lemmatize?

In [30]:
def stem(string):
    """This function takes in a string and returns the stemmed version of string"""
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in string.split()]
    string = ' '.join(stems)
    
    return string

In [31]:
example2 = stem(example)

In [33]:
#testing out lemmatizer:
wnl = nltk.stem.WordNetLemmatizer()
ps = nltk.porter.PorterStemmer()

for word in example.split():
    print('stem:', ps.stem(word), '-- lemma:', wnl.lemmatize(word))

stem: ubuntu -- lemma: ubuntu
stem: 1804 -- lemma: 1804
stem: ci -- lemma: ci
stem: stig -- lemma: stig
stem: build -- lemma: build
stem: statushttpstraviscicomflorianutzubuntu1804cissvgbranchmasterhttpstraviscicomflorianutzubuntu1804ci -- lemma: statushttpstraviscicomflorianutzubuntu1804cissvgbranchmasterhttpstraviscicomflorianutzubuntu1804cis
stem: ansibl -- lemma: ansible
stem: rolehttpsimgshieldsiobadgeroleflorianutzubuntu1804cisbluesvghttpsgalaxyansiblecomflorianutzubuntu1804ci -- lemma: rolehttpsimgshieldsiobadgeroleflorianutzubuntu1804cisbluesvghttpsgalaxyansiblecomflorianutzubuntu1804cis
stem: configur -- lemma: configure
stem: ubuntu -- lemma: ubuntu
stem: 1804 -- lemma: 1804
stem: machin -- lemma: machine
stem: to -- lemma: to
stem: be -- lemma: be
stem: ci -- lemma: ci
stem: compliant -- lemma: compliant
stem: level -- lemma: level
stem: 1 -- lemma: 1
stem: and -- lemma: and
stem: 2 -- lemma: 2
stem: find -- lemma: finding
stem: will -- lemma: will
stem: be -- lemma: be
stem

stem: authent -- lemma: authentication
stem: requir -- lemma: required
stem: for -- lemma: for
stem: singl -- lemma: single
stem: user -- lemma: user
stem: mode -- lemma: mode
stem: it -- lemma: it
stem: is -- lemma: is
stem: disabl -- lemma: disabled
stem: by -- lemma: by
stem: default -- lemma: default
stem: as -- lemma: a
stem: it -- lemma: it
stem: is -- lemma: is
stem: set -- lemma: setting
stem: random -- lemma: random
stem: password -- lemma: password
stem: for -- lemma: for
stem: root -- lemma: root
stem: to -- lemma: to
stem: enabl -- lemma: enable
stem: it -- lemma: it
stem: set -- lemma: set
stem: yaml -- lemma: yaml
stem: ubuntu1804cisrule153 -- lemma: ubuntu1804cisrule153
stem: true -- lemma: true
stem: to -- lemma: to
stem: use -- lemma: use
stem: other -- lemma: other
stem: than -- lemma: than
stem: random -- lemma: random
stem: password -- lemma: password
stem: yaml -- lemma: yaml
stem: ubuntu1804cisrootpassword -- lemma: ubuntu1804cisrootpassword
stem: ' -- lemma: '
st

In [45]:
def lemmatize(string):
    """This function takes in a string and returns a lemmatized version of the string."""
    # create our lemmatizer object
    wnl = nltk.stem.WordNetLemmatizer()
    # use a list comprehension to lemmatize each word
    # string.split() => output a list of every token inside of the document
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    # glue the lemmas back together by the strings we split on
    string = ' '.join(lemmas)
    #return the altered document
    return string

In [42]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    """This function takes in a string, applies stop_words to take out common stop words, includes any extra words
    wanted removed and excludes and stop words wanted kept"""
    #assign stopwords from nltk into a stopword_list:
    stopword_list = stopwords.words('english')
    #remove any excluded stopwords that are wanted to be kept:
    stopword_list = set(stopword_list) - set(exclude_words)
    #add on any other stopwords using a union
    stopword_list = stopword_list.union(set(extra_words))
    #split the words by spaces
    words = string.split()
    #filter out every word in dict unless in stop word:
    filtered_words = [word for word in words if word not in stopword_list]
    #put it back together with spaces
    string_without_stopwords = ' '.join(filtered_words)
    #return df back
    return string_without_stopwords

In [43]:
#sticking with lemmatizer and applying clean, lemmatizer, and stopwords:
def prep_article_data(df, column, ignore_columns = [], extra_words=[], exclude_words=[]):
    '''
    This function takes in a df and the string name for a text column with the
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords,
                                  extra_words=extra_words,
                                  exclude_words=exclude_words)
    
    df['stemmed'] = df['clean'].apply(stem)
    
    df['lemmatized'] = df['clean'].apply(lemmatize)

    cleaned_columns = [column, 'clean','stemmed','lemmatized']
    all_columns = ignore_columns + cleaned_columns


    return df[all_columns]


___________________________________

In [46]:
ubuntu_df = prep_article_data(ubuntu, 'readme_contents', ignore_columns = ['repo','language'], extra_words=[], exclude_words=[])

In [47]:
ubuntu_df.head()

Unnamed: 0,repo,language,readme_contents,clean,stemmed,lemmatized
0,dockerfile/ubuntu,Shell,## Ubuntu Dockerfile\n\n\nThis repository cont...,ubuntu dockerfile repository contains dockerfi...,ubuntu dockerfil repositori contain dockerfil ...,ubuntu dockerfile repository contains dockerfi...
1,boxcutter/ubuntu,Shell,# Packer templates for Ubuntu written in legac...,packer templates ubuntu written legacy json ov...,packer templat ubuntu written legaci json over...,packer template ubuntu written legacy json ove...
2,wszqkzqk/deepin-wine-ubuntu,C,# Deepin wine for Ubuntu and Debian\n\n## 一、项目...,deepin wine ubuntu debian deepinwine ubuntudeb...,deepin wine ubuntu debian deepinwin ubuntudebi...,deepin wine ubuntu debian deepinwine ubuntudeb...
3,fcwu/docker-ubuntu-vnc-desktop,HTML,# docker-ubuntu-vnc-desktop\n\n[![Docker Pulls...,dockerubuntuvncdesktop docker pullshttpsimgshi...,dockerubuntuvncdesktop docker pullshttpsimgshi...,dockerubuntuvncdesktop docker pullshttpsimgshi...
4,docker-32bit/ubuntu,Shell,ubuntu\n======\n\nBuild a docker image for ubu...,ubuntu build docker image ubuntu i386 run buil...,ubuntu build docker imag ubuntu i386 run build...,ubuntu build docker image ubuntu i386 run buil...
