# N-gram Analysis

## 1.0 Set Up

In [0]:
# Import libraries
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import re

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

In [0]:
# Read the enriched gold-layer table through Spark, then bring it into pandas
spark_df = spark.table("census_bureau_capstone.gold.census_repackaged_enriched")
df = spark_df.toPandas()

# Preview the pandas DataFrame
display(df)

****

## 2.0 Exploratory Analysis

### 2.1 Repackaged vs Cited

In [0]:
# Number of rows per value of the 'label' column
label_counts = df.groupby('label').size()
label_counts

### 2.2 URL Analysis

In [0]:
# Extract the root URL (network location) and strip a leading 'www.' only.
# A plain str.replace('www.', '') would also delete 'www.' from the middle of
# a host (e.g. 'awww.example.com' -> 'aexample.com'), so the pattern is
# anchored to the start of the string.
df['root_url'] = df['Target-URI'].apply(
    lambda uri: re.sub(r'^www\.', '', urlparse(uri).netloc)
)


def _extract_tld(host):
    """Return the trailing '.xx' alphabetic suffix of host, or None if there is none."""
    # Search once and reuse the match instead of running the regex twice per row
    match = re.search(r'\.[a-zA-Z]{2,}$', host)
    return match.group(0) if match else None


# Extract top-level domain (None when the host does not end in a dot
# followed by two or more letters, e.g. an IP address or a host with a port)
df['tld'] = df['root_url'].apply(_extract_tld)

display(df)

In [0]:
# Count repackaged rows per top-level domain, most frequent first
repackaged_rows = df[df['label'] == 'repackages']
tld_counts = repackaged_rows.groupby('tld').size().sort_values(ascending=False)

# Turn the counts into a two-column frame with display-friendly headers
df_tld = tld_counts.reset_index(name='Count').rename(columns={'tld': 'Top Level Domain'})

# Visualize as a horizontal bar chart
sns.barplot(data=df_tld, x='Count', y='Top Level Domain')

plt.title('Top Level Domains')
plt.xlabel('Count')
plt.ylabel('Top Level Domain')
plt.show()



In [0]:
# Count repackaged rows per root URL, most frequent first
repackaged_rows = df[df['label'] == 'repackages']
url_counts = repackaged_rows.groupby('root_url').size().sort_values(ascending=False)

# Turn the counts into a two-column frame with display-friendly headers
df_url = url_counts.reset_index(name='Count').rename(columns={'root_url': 'URL'})

# Visualize only the ten most frequent root URLs
top_urls = df_url.head(10)
sns.barplot(data=top_urls, x='Count', y='URL')

plt.title('Top 10 URLs')
plt.xlabel('Count')
plt.ylabel('URL')
plt.show()

In [0]:
# Inspect every row whose root URL is schooldigger.com
is_schooldigger = df['root_url'] == 'schooldigger.com'
display(df[is_schooldigger])

In [0]:
# Inspect every row whose root URL is census.gov
is_census = df['root_url'] == 'census.gov'
display(df[is_census])

In [0]:
# Inspect every row whose root URL is areavibes.com
is_areavibes = df['root_url'] == 'areavibes.com'
display(df[is_areavibes])

****

## 3.0 N-gram Analysis

In [0]:
# Drop rows whose root URL is census.gov before running the n-gram analysis
not_census = df['root_url'] != 'census.gov'
df = df[not_census]

In [0]:
# Define function to get top n-grams
def top_ngrams(dataframe, text_column, ngram_range=(1, 1), additional_stop_words=None, top_n=10):
    """
    Return the most frequent n-grams found in a text column.

    Five arguments:
    1. dataframe - input DataFrame
    2. text_column - name of text column
    3. ngram_range - tuple (min, max) for n-gram range
    4. additional_stop_words - optional iterable of stop words to add to the
       standard English stop words list (default: no extra stop words)
    5. top_n - number of top results

    Returns a DataFrame with columns 'term' and 'frequency', sorted by
    frequency in descending order and limited to the first top_n rows.
    """
    # Extract text from the dataframe column; missing values are skipped
    # because CountVectorizer rejects None/NaN documents
    text_data = dataframe[text_column].dropna().tolist()

    # Stop words: built-in English list plus any caller-supplied extras.
    # A None default avoids sharing one mutable list object across calls.
    extra_stop_words = set() if additional_stop_words is None else set(additional_stop_words)
    combined_stop_words = list(ENGLISH_STOP_WORDS.union(extra_stop_words))

    # Instantiate vectorizer
    vectorizer = CountVectorizer(
        ngram_range=ngram_range,
        stop_words=combined_stop_words,
    )

    # Vectorize into a sparse document-term count matrix
    sparse_matrix = vectorizer.fit_transform(text_data)

    # Get feature names (n-grams), aligned with the matrix columns
    feature_names = vectorizer.get_feature_names_out()

    # Sum the counts of each term across all documents directly on the sparse
    # matrix; converting to a dense (documents x vocabulary) array first would
    # allocate memory for every zero entry just to compute a column sum
    term_frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()

    # Create dataframe with terms and frequencies
    df_ngrams = pd.DataFrame({
        'term': feature_names,
        'frequency': term_frequencies,
    })

    # Sort by frequency (descending) and keep only the top n results
    df_ngrams = df_ngrams.sort_values('frequency', ascending=False)
    df_ngrams = df_ngrams.head(top_n)

    return df_ngrams

In [0]:
# Define function for n-grams visualization
def ngrams_viz(dataframe, text_column, ngram_range=(1, 1), additional_stop_words=None, top_n=10):
    """
    Plot the top n-grams of cited vs repackaged rows side by side.
    Uses top_ngrams() to get the data; rows are split on the 'label' column
    ('cites' on the left subplot, 'repackages' on the right).

    Five arguments:
    1. dataframe - input DataFrame (must contain a 'label' column)
    2. text_column - name of text column
    3. ngram_range - tuple (min, max) for n-gram range
    4. additional_stop_words - optional iterable of stop words to add to the
       standard stop words list (default: no extra stop words)
    5. top_n - number of top results

    Shows the figure and returns None.
    """
    # Normalize to a fresh list so top_ngrams() always receives a concrete
    # list and no mutable default object is shared between calls
    stop_words = [] if additional_stop_words is None else list(additional_stop_words)

    # (label value in the data, name shown in the subplot title)
    panels = [('cites', 'Cited'), ('repackages', 'Repackaged')]

    # Compute both result sets before creating the figure, so a failure in
    # top_ngrams() does not leave an empty figure behind
    top_by_panel = [
        (
            title_name,
            top_ngrams(dataframe[dataframe['label'] == label], text_column, ngram_range, stop_words, top_n),
        )
        for label, title_name in panels
    ]

    # Create a figure with one subplot per label
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

    # Draw each label's n-grams on its own subplot
    for ax, (title_name, df_top) in zip(axes, top_by_panel):
        sns.barplot(x="frequency",
                    y="term",
                    data=df_top,
                    ax=ax)
        ax.set_title(f'Top {top_n} {ngram_range} N-Grams in {title_name}')
        ax.set_ylabel(f'N-Gram (Range: {ngram_range})')

    # Display plot
    plt.tight_layout()
    plt.show()

In [0]:
# Baseline: top 25 unigrams with no extra stop words beyond the standard list
ngrams_viz(
    dataframe=df,
    text_column='text_norm',
    ngram_range=(1, 1),
    additional_stop_words=[],
    top_n=25,
)

In [0]:
# Extra stop words: URL fragments and zero-only number tokens
additional_stop_words = ['www', 'http', 'com', 'org', 'gov', 'edu', '000', '00']

# Append the string form of every integer from 0 through 999
additional_stop_words.extend(map(str, range(1000)))

### 3.1 Unigrams

In [0]:
# Top 25 unigrams, filtered with the additional stop words
ngrams_viz(
    dataframe=df,
    text_column='text_norm',
    ngram_range=(1, 1),
    additional_stop_words=additional_stop_words,
    top_n=25,
)

### 3.2 Bigrams

In [0]:
# Top 25 bigrams, filtered with the additional stop words
ngrams_viz(
    dataframe=df,
    text_column='text_norm',
    ngram_range=(2, 2),
    additional_stop_words=additional_stop_words,
    top_n=25,
)

### 3.3 Trigrams

In [0]:
# Top 25 trigrams, filtered with the additional stop words
ngrams_viz(
    dataframe=df,
    text_column='text_norm',
    ngram_range=(3, 3),
    additional_stop_words=additional_stop_words,
    top_n=25,
)

## 4.0 Save Repackaged Unigrams

In [0]:
# Compute the top 100 unigrams for repackaged rows (pandas DataFrame)
unigrams_repackaged = top_ngrams(
    dataframe=df[df['label'] == 'repackages'],
    text_column='text_norm',
    ngram_range=(1, 1),
    additional_stop_words=additional_stop_words,
    top_n=100,
)

# Convert the Pandas DataFrame to a PySpark DataFrame
unigrams_repackaged = spark.createDataFrame(unigrams_repackaged)

# Save to table in gold layer.
# The writer's default mode raises an error when the table already exists,
# which makes this cell fail on every re-run of the notebook; overwrite
# replaces the previous result so the notebook can be re-run end to end.
unigrams_repackaged.write.mode('overwrite').saveAsTable('census_bureau_capstone.gold.unigrams_repackaged')