# Week 6

## Part 1: Chapter 1 of the book

In [None]:
import nltk
import ssl

# Work around SSL certificate errors when downloading NLTK data: if the private
# helper ssl._create_unverified_context exists, install it as the factory used
# for ssl's default HTTPS context; otherwise leave the ssl module untouched.
# NOTE(review): this presumably disables certificate verification for every
# later HTTPS connection that relies on ssl's default context, not only for the
# download below -- confirm this is acceptable for the whole session.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # The helper is not available on this Python build: keep the defaults.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the "book" collection of NLTK data.
nltk.download("book")

In [None]:
from nltk.book import *

In [None]:
# Text.concordance: list the occurrences of "monstrous" in text1 with context.
text1.concordance("monstrous")

In [None]:
# Text.similar: words used in contexts similar to "monstrous", per text.
text1.similar("monstrous")
text2.similar("monstrous")

In [None]:
# Text.common_contexts: contexts shared by "monstrous" and "very" in text2.
text2.common_contexts(["monstrous", "very"])

In [None]:
# Text.dispersion_plot: plot where each of these words occurs across text4.
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

In [None]:
# Text.generate: produce text from text3.
text3.generate()

In [None]:
# Total number of tokens in text3.
len(text3)

In [None]:
# Sorted list of the distinct tokens of text3.
sorted(set(text3))

In [None]:
# Number of distinct tokens in text3.
len(set(text3))

In [None]:
# Ratio of distinct tokens to total tokens in text3.
len(set(text3)) / len(text3)

In [None]:
# Number of occurrences of the token "smote" in text3.
text3.count("smote")

In [None]:
# Percentage of the tokens of text4 that are exactly "a".
100 * text4.count("a") / len(text4)

In [None]:
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in ``text``.

    ``text`` is any sized iterable of hashable tokens (a list of words, an
    ``nltk.Text``, a string, ...). An empty ``text`` raises
    ``ZeroDivisionError``.
    """
    distinct_tokens = set(text)
    return len(distinct_tokens) / len(text)

def percentage(count, total):
    """Express ``count`` as a percentage of ``total``.

    A ``total`` of zero raises ``ZeroDivisionError``.
    """
    scaled = 100 * count
    return scaled / total

In [None]:
# Distinct-to-total token ratio of text3, via the helper.
lexical_diversity(text3)

In [None]:
# Same ratio for text5.
lexical_diversity(text5)

In [None]:
# 4 out of 5 expressed as a percentage.
percentage(4, 5)

In [None]:
# Percentage of the tokens of text4 that are exactly "a", via the helper.
percentage(text4.count("a"), len(text4))

## Part 2: Rapper network

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import requests
import os
import regex as re

In [None]:
# Table describing the rappers, loaded from the CSV shipped with the course files.
df_rapper = pd.read_csv("../files/Rappers.csv")

# Endpoint of the English Wikipedia API.
WIKI_API_URL = "https://en.wikipedia.org/w/api.php"

# Query parameters shared by every request: ask for the "extracts" property
# as plain text ("explaintext"), one extract per request, answered as JSON.
params = dict(
    action="query",
    format="json",
    exlimit="1",
    explaintext="1",
    prop="extracts",
)

def get_wiki_page(title):
    """Query the Wikipedia API for the page called ``title``.

    Args:
        title: Title of the Wikipedia page to request.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If no answer arrives within the timeout.
    """
    # Build a per-call copy so the shared module-level ``params`` dict is never
    # mutated (the previous title would otherwise linger in it between calls).
    query_params = {**params, "titles": title}
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(WIKI_API_URL, params=query_params, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page as JSON.
    response.raise_for_status()
    return response.json()

# Creation of the folder where we will store the pages' content.
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs("../files/rapper_pages", exist_ok=True)

for rapper in df_rapper["WikipediaPageName"]:
    # The answer is shaped like {"query": {"pages": {<id>: {"extract": ...}}}};
    # one title is requested per call, so popitem() yields that single page.
    rapper_page = get_wiki_page(rapper).get("query").get("pages").popitem()[1].get("extract")
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # encode non-ASCII characters, and PlaintextCorpusReader reads the files
    # back as UTF-8 by default.
    with open(f"../files/rapper_pages/{rapper}.txt", "w", encoding="utf-8") as f:
        f.write(rapper_page)

    


In [None]:
# Alphabetically sorted names of the downloaded page files. Only ".txt" entries
# are kept: os.listdir also returns anything else living in the folder (hidden
# files, checkpoint directories, ...), which must not end up in the corpus.
file_name_list = sorted(
    name for name in os.listdir("../files/rapper_pages") if name.endswith(".txt")
)

# Sort df_rapper by WikipediaPageName and renumber its index from 0.
df_rapper = df_rapper.sort_values(by="WikipediaPageName").reset_index(drop=True)

In [None]:
# Build a plain-text corpus out of the downloaded rapper pages
corpus_root = "../files/rapper_pages"
rapper_corpus = nltk.corpus.PlaintextCorpusReader(
    root=corpus_root,
    fileids=file_name_list,
)

# Wrap the corpus tokens in an nltk.Text to get the exploration helpers
text = nltk.Text(rapper_corpus.words())


In [None]:
# Text.concordance: list the occurrences of "feat" in the corpus with context.
text.concordance("feat")

In [None]:
# Text.similar: words used in contexts similar to "feat".
text.similar("feat")

In [None]:
# Text.common_contexts: contexts shared by "feat" and "featuring".
text.common_contexts(["feat", "featuring"])

In [None]:
# Text.dispersion_plot: plot where each of these words occurs across the corpus.
text.dispersion_plot(["featuring", "drug", "song", "album"])

In [None]:
# Materialise every pair of consecutive tokens of the corpus.
# NOTE(review): `bigram` is not used in the visible cells -- confirm it is
# needed later, since it holds one tuple per token in memory.
bigram = list(nltk.bigrams(text))
# Text.collocations: print the collocations NLTK finds in the corpus.
text.collocations()

In [None]:
# Distinct tokens beginning with a lowercase "h", in alphabetical order
h_words = sorted(token for token in set(text) if token.startswith("h"))

# Show the first 5 of them
h_words[:5]

In [None]:
# Index of the token "Snoop" in the corpus, as returned by Text.index. It is
# printed explicitly: a notebook cell only displays its last expression, so the
# bare call would compute the index and silently throw it away.
print(text.index("Snoop"))

# Find the sentences where the word "Snoop" appears. The list is built in one
# step so the name never temporarily refers to every sentence of the corpus.
snoop_sentences = [s for s in rapper_corpus.sents() if "Snoop" in s]
snoop_sentences[:5]

In [None]:
# Five longest fully-uppercase tokens of the corpus

# Distinct uppercase tokens, longest first
upper_words = sorted(
    (token for token in set(text) if token.isupper()),
    key=len,
    reverse=True,
)
upper_words[:5]

In [None]:
# Lexical diversity of the whole corpus: distinct tokens / total tokens.
# The ratio is bound to `corpus_lexical_diversity` rather than to
# `lexical_diversity`: that name already belongs to the helper function defined
# in Part 1, and rebinding it to a float would make every later (or re-run)
# `lexical_diversity(...)` call fail with "'float' object is not callable".
nb_tokens = len(text)
nb_distinct_tokens = len(set(text))
corpus_lexical_diversity = nb_distinct_tokens / nb_tokens

#Compute the lexical diversity for the west and east coast rappers
west_coast_rappers = df_rapper[df_rapper["Coast"] == "West"]
east_coast_rappers = df_rapper[df_rapper["Coast"] == "East"]

west_coast_rappers = west_coast_rappers["WikipediaPageName"].tolist()
east_coast_rappers = east_coast_rappers["WikipediaPageName"].tolist()

#We need to add ".txt" to the rapper's name to match the file name
west_coast_rappers = [rapper + ".txt" for rapper in west_coast_rappers]
east_coast_rappers = [rapper + ".txt" for rapper in east_coast_rappers]

# One corpus per coast, restricted to the page files of that coast's rappers
west_coast_rappers_corpus = nltk.corpus.PlaintextCorpusReader(corpus_root, west_coast_rappers)
east_coast_rappers_corpus = nltk.corpus.PlaintextCorpusReader(corpus_root, east_coast_rappers)

west_coast_rappers_text = nltk.Text(west_coast_rappers_corpus.words())
east_coast_rappers_text = nltk.Text(east_coast_rappers_corpus.words())

# Distinct tokens / total tokens, computed separately for each coast
west_coast_rappers_lexical_diversity = len(set(west_coast_rappers_text)) / len(west_coast_rappers_text)
east_coast_rappers_lexical_diversity = len(set(east_coast_rappers_text)) / len(east_coast_rappers_text)

print(f"West coast rappers lexical diversity: {west_coast_rappers_lexical_diversity}")
print(f"East coast rappers lexical diversity: {east_coast_rappers_lexical_diversity}")

In [None]:
# Create a frequency distribution of the tokens in the corpus
# (`fdist` maps each token to its number of occurrences).
fdist = nltk.FreqDist(text)

# Display the 75 most common tokens together with their counts
fdist.most_common(75)

In [None]:
# Distinct tokens of exactly four characters, most frequent first
# (frequencies are looked up in `fdist`)
four_letter_words = sorted(
    (token for token in set(text) if len(token) == 4),
    key=fdist.get,
    reverse=True,
)
four_letter_words

In [None]:
# Total number of characters over all the tokens of the corpus
total = sum(map(len, text))
print(total)

# Average token length = total characters / number of tokens
avg_word_length = total / len(text)
print(avg_word_length)

In [None]:
def vocab_size(text):
    """Return the number of distinct tokens in ``text``."""
    distinct_tokens = set(text)
    return len(distinct_tokens)

def percent(word, text):
    """Return the percentage of the tokens of ``text`` that are equal to ``word``.

    ``text`` must provide ``count`` and ``len`` (a list, a string, an
    ``nltk.Text``, ...). An empty ``text`` raises ``ZeroDivisionError``.
    """
    occurrences = text.count(word)
    return 100 * occurrences / len(text)

# Number of distinct tokens in the rapper corpus
print(vocab_size(text))
# Percentage of the corpus tokens that are exactly "Dre"
print(percent("Dre", text))

## Part 3: Zipf's Law