# Imports

### This notebook is exploratory data analysis (EDA) for unigram and bigram analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Load the presidential speeches (with metadata) from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='US-president-speeches-with-metadata-test.csv')
def todatetime(dataframe : pd.DataFrame, datecols : list[str] = ['Date']) -> pd.DataFrame:
    '''
    Return a copy of ``dataframe`` with the given columns parsed as pandas datetimes.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame. It is copied, so the caller's frame is left untouched.
    datecols : list[str] or str, default ['Date']
        Name(s) of the column(s) to convert with ``pd.to_datetime``. A single
        column may be given as a plain string.

    Returns
    -------
    pd.DataFrame
        Copy of the input in which every listed column has been converted.

    Raises
    ------
    KeyError
        If a requested column does not exist in ``dataframe``.
    '''
    # A bare string would otherwise be iterated character by character
    # ('Date' -> 'D', 'a', 't', 'e') and fail with a misleading KeyError.
    # Rebinding the local name never mutates the shared default list.
    if isinstance(datecols, str):
        datecols = [datecols]

    dataframe1 = dataframe.copy()
    for col in datecols:
        dataframe1[col] = pd.to_datetime(dataframe1[col])
    return dataframe1

def attaching_periods(dataframe : pd.DataFrame, timecol : str = 'Date') -> pd.DataFrame :
    """
    Attach a historical era to every speech based on the year of ``timecol``.

    Eras (as listed in the original notes, attributed to the "Mili centre"):
    * `Early Republic (1789–1829)`
    * `Jacksonian Democracy(1829–1853)`
    * `Sectional Conflict (1853–1881)`
    * `Gilded Age (1881–1897)`
    * `Progressive Era (1897–1921)`
    * `Depression and World Conflict(1921–1961)`
    * `Social Change and Soviet Relations (1961–1989)`
    * `Globalization(1989–)`

    The listed ranges share their boundary years. A boundary year is assigned
    to the EARLIER era (e.g. 1829 -> 'Early Republic'), so each era effectively
    covers ``(previous end, own end]``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame. It is copied, so the caller's frame is left untouched.
    timecol : str, default 'Date'
        Name of the datetime column the year is read from (via ``.dt.year``).

    Returns
    -------
    pd.DataFrame
        Copy of the input with two added columns: 'Year' and 'Time_Periods'.
        Rows whose year is before 1789 or missing get a missing value in
        'Time_Periods' instead of breaking the column assignment.
    """
    # Inclusive last year of each era, in chronological order.
    upper_bounds = [1829, 1853, 1881, 1897, 1921, 1961, 1989, float('inf')]
    period_names = [
        'Early Republic',
        'Jacksonian Democracy',
        'Sectional Conflict',
        'Gilded Age',
        'Progressive Era',
        'Depression and World Conflict',
        'Social Change and Soviet Relations',
        'Globalization',
    ]

    dataframe1 = dataframe.copy()
    years = dataframe1[timecol].dt.year
    dataframe1['Year'] = years

    # Right-closed bins: (1788, 1829], (1829, 1853], ..., (1989, inf).
    # 1788 as the first edge makes 1789 the earliest labelled year.
    eras = pd.cut(years, bins=[1788] + upper_bounds, labels=period_names, right=True)

    # Store plain strings (object dtype) rather than a categorical column.
    dataframe1['Time_Periods'] = eras.astype(object)
    return dataframe1
# Parse the 'Date' column, then tag every speech with its year and era.
df = attaching_periods(todatetime(df))

### Calculating the conditional probability of each term, relative to the total number of documents for each label

#### https://stats.stackexchange.com/questions/72068/document-classification-question?rq=1
#### https://community.alteryx.com/t5/Data-Science/Naive-Bayes-in-Python/ba-p/138424

In [75]:
# https://stats.stackexchange.com/questions/72068/document-classification-question?rq=1
# https://community.alteryx.com/t5/Data-Science/Naive-Bayes-in-Python/ba-p/138424
# Both of these links were used to adapt the two functions below

def conditional_probability(document_term_matrix:np.array=None, y_true:np.array=None, dictionary:list=None)->dict:
    '''
    Estimate, for every term and label, the share of that label's documents
    in which the term occurs at least once.

    Parameters
    ----------
    document_term_matrix : array-like of shape (n_documents, n_terms)
        Dense term counts. Only whether a count is > 0 matters.
    y_true : array-like of shape (n_documents,)
        Label of each document, aligned with the matrix rows.
    dictionary : sequence
        Term for each matrix column, in column order.

    Returns
    -------
    dict
        ``{term: {label: probability}}`` where ``probability`` is
        (documents with that label containing the term) / (documents with that label).

    Raises
    ------
    TypeError
        If any of the three arguments is left as ``None``.
    '''
    if document_term_matrix is None or y_true is None or dictionary is None:
        raise TypeError(
            "document_term_matrix, y_true and dictionary are all required"
        )

    # Coerce so plain lists / Series work as well as numpy arrays.
    counts = np.asarray(document_term_matrix)
    labels = np.asarray(y_true)

    # Presence/absence of every term in every document, computed once.
    presence = counts > 0

    unique_labels = np.unique(labels)

    # For each label: fraction of its documents containing each term
    # (one vectorised column sum per label instead of one scan per word/label).
    share_by_label = {}
    for label in unique_labels:
        in_label = labels == label
        share_by_label[label] = presence[in_label].sum(axis=0) / in_label.sum()

    # Re-shape into the term -> label -> probability mapping callers expect.
    probs = {}
    for i, word in enumerate(dictionary):
        probs[word] = {label: share_by_label[label][i] for label in unique_labels}

    return probs

### Utilising link below to adapt code 
https://stackoverflow.com/questions/51160354/computing-top-n-word-pair-co-occurrences-from-document-term-matrix

In [76]:
def top_n_period_representative_words(probs:dict=None, n:int=25, labels:np.array=None)->dict:
    '''
    Pick, for every label, the ``n`` terms with the highest conditional probability.

    Parameters
    ----------
    probs : dict
        ``{term: {label: probability}}`` mapping, as produced by
        ``conditional_probability``.
    n : int, default 25
        Number of terms to keep per label.
    labels : array-like, optional
        Labels to report on; duplicates are collapsed with ``np.unique``.
        When omitted, the labels are taken from the keys found in ``probs``.

    Returns
    -------
    dict
        ``{label: [term, ...]}`` with terms ordered from most to least probable.
        Ties keep the order in which the terms appear in ``probs``.
    '''
    if labels is None:
        # The default used to fall through to np.unique(None) and a
        # KeyError: None; derive the label set from the probabilities instead.
        unique_labels = sorted({label for per_label in probs.values() for label in per_label})
    else:
        unique_labels = np.unique(labels)

    representative_words = {}
    for label in unique_labels:
        # Stable descending sort of the terms by their probability under this label.
        ranked = sorted(probs, key=lambda word: probs[word][label], reverse=True)
        representative_words[label] = ranked[:n]

    return representative_words

# Republican and Democratic Model

In [77]:
# Speeches from the 'Globalization' period whose Party is 'Republican'
rep = df.loc[df['Time_Periods'].eq('Globalization') & df['Party'].eq('Republican')]

# Speeches from the 'Globalization' period whose Party is 'Democratic'
dem = df.loc[df['Time_Periods'].eq('Globalization') & df['Party'].eq('Democratic')]

### Unigram Analysis

In [78]:
# Unigram count matrices, with a separate vocabulary fitted for each party.
# max_df=0.9 drops terms found in more than 90% of that party's speeches.
rep_cv = CountVectorizer(ngram_range=(1, 1), max_df=0.9)
rep_dt_matrix = rep_cv.fit_transform(rep['Speech'])

dem_cv = CountVectorizer(ngram_range=(1, 1), max_df=0.9)
dem_dt_matrix = dem_cv.fit_transform(dem['Speech'])

In [79]:
# Republican speeches: fit Naive Bayes on the unigram counts, then predict on the same data
rep_classifier = MultinomialNB().fit(rep_dt_matrix, rep['Time_Periods'])
rep_predictions = rep_classifier.predict(rep_dt_matrix)

# Democratic speeches: same procedure
dem_classifier = MultinomialNB().fit(dem_dt_matrix, dem['Time_Periods'])
dem_predictions = dem_classifier.predict(dem_dt_matrix)

In [80]:
# Per-term document share within each label, computed on the dense unigram counts.
rep_probsy = conditional_probability(
    rep_dt_matrix.toarray(),
    rep['Time_Periods'].to_numpy(),
    rep_cv.get_feature_names_out(),
)

dem_probsy = conditional_probability(
    dem_dt_matrix.toarray(),
    dem['Time_Periods'].to_numpy(),
    dem_cv.get_feature_names_out(),
)

In [81]:
# Top 30 unigrams per party
rep_res = top_n_period_representative_words(
    probs=rep_probsy, n=30, labels=rep['Time_Periods'].to_numpy()
)
dem_res = top_n_period_representative_words(
    probs=dem_probsy, n=30, labels=dem['Time_Periods'].to_numpy()
)

# Side-by-side table: Republicans in the left column, Democrats in the right
rep_dem = pd.concat(
    [pd.DataFrame(rep_res), pd.DataFrame(dem_res)], axis=1
).set_axis(['Republican_Unigrams', 'Democrats_Unigrams'], axis=1)
rep_dem

Unnamed: 0,Republican_Unigrams,Democrats_Unigrams
0,great,first
1,nation,let
2,other,long
3,states,nation
4,first,states
5,make,thank
6,no,together
7,them,well
8,what,were
9,when,your


In [82]:
# NOTE(review): this cell repeats the computation of the preceding unigram
# table cell and produces the same output; it can probably be removed.
rep_res = top_n_period_representative_words(
    probs=rep_probsy, n=30, labels=rep['Time_Periods'].to_numpy()
)
dem_res = top_n_period_representative_words(
    probs=dem_probsy, n=30, labels=dem['Time_Periods'].to_numpy()
)

# Republicans in the left column, Democrats in the right
party_frames = [pd.DataFrame(rep_res), pd.DataFrame(dem_res)]
rep_dem = pd.concat(party_frames, axis=1)
rep_dem.columns = ['Republican_Unigrams', 'Democrats_Unigrams']
rep_dem

Unnamed: 0,Republican_Unigrams,Democrats_Unigrams
0,great,first
1,nation,let
2,other,long
3,states,nation
4,first,states
5,make,thank
6,no,together
7,them,well
8,what,were
9,when,your


### Bigram Analysis

In [83]:
# Bigram count matrices, with a separate vocabulary fitted for each party.
# These rebind the rep_*/dem_* names used for the unigram run above.
rep_cv = CountVectorizer(ngram_range=(2, 2), max_df=0.9)
rep_dt_matrix = rep_cv.fit_transform(rep['Speech'])

dem_cv = CountVectorizer(ngram_range=(2, 2), max_df=0.9)
dem_dt_matrix = dem_cv.fit_transform(dem['Speech'])

### Running Classifier

In [84]:
# Republican speeches: fit Naive Bayes on the bigram counts, then predict on the same data
rep_classifier = MultinomialNB().fit(rep_dt_matrix, rep['Time_Periods'])
rep_predictions = rep_classifier.predict(rep_dt_matrix)

# Democratic speeches: same procedure
dem_classifier = MultinomialNB().fit(dem_dt_matrix, dem['Time_Periods'])
dem_predictions = dem_classifier.predict(dem_dt_matrix)

In [85]:
# Per-bigram document share within each label, computed on the dense bigram counts.
rep_probsy = conditional_probability(
    rep_dt_matrix.toarray(),
    rep['Time_Periods'].to_numpy(),
    rep_cv.get_feature_names_out(),
)

dem_probsy = conditional_probability(
    dem_dt_matrix.toarray(),
    dem['Time_Periods'].to_numpy(),
    dem_cv.get_feature_names_out(),
)

In [86]:
# Top 30 bigrams per party
rep_res = top_n_period_representative_words(rep_probsy, 30, labels=rep['Time_Periods'].to_numpy())
dem_res = top_n_period_representative_words(dem_probsy, 30, labels=dem['Time_Periods'].to_numpy())

# Republicans in the left column, Democrats in the right.
# The columns hold bigrams here, so they are labelled as such (the labels
# were previously copied over from the unigram section).
rep_dem = pd.concat([pd.DataFrame(rep_res), pd.DataFrame(dem_res)], axis=1)
rep_dem.columns = ['Republican_Bigrams', 'Democrats_Bigrams']
rep_dem

Unnamed: 0,Republican_Unigrams,Democrats_Unigrams
0,of our,we can
1,thank you,it is
2,united states,thank you
3,we will,that the
4,the united,to do
5,it is,from the
6,with the,the world
7,will be,the united
8,and to,united states
9,that we,will be


# Throughout Time Model

In [87]:
# Unigram counts over every speech, all periods and parties together
vect = CountVectorizer(ngram_range=(1, 1), min_df=1, max_df=0.9)
dt_matrix = vect.fit_transform(df['Speech'])

In [88]:
# Build a multinomial Naive Bayes model and train it on the document-term
# matrix, with each speech's time period as its label
classifier = MultinomialNB().fit(dt_matrix, df['Time_Periods'])
# Predict a time period for every document the model was trained on
predictions = classifier.predict(dt_matrix)

In [89]:
# Per-term document share within each time period, across all speeches
probsy = conditional_probability(
    dt_matrix.toarray(),
    df['Time_Periods'].to_numpy(),
    vect.get_feature_names_out(),
)

In [90]:
# Top 25 terms for each time period
data = top_n_period_representative_words(probs=probsy, n=25, labels=df['Time_Periods'].to_numpy())

In [91]:
# One column per time period, rows ordered from most to least representative term
pd.DataFrame(data=data)

Unnamed: 0,Depression and World Conflict,Early Republic,Gilded Age,Globalization,Jacksonian Democracy,Progressive Era,Sectional Conflict,Social Change and Soviet Relations
0,we,them,such,we,united,if,united,view
1,people,united,upon,you,government,its,such,we
2,they,may,may,people,may,do,government,you
3,other,other,now,view,made,no,may,can
4,one,they,government,us,such,we,now,who
5,there,those,any,who,upon,may,any,people
6,great,such,should,these,should,who,upon,now
7,no,these,united,america,would,government,other,american
8,these,government,can,can,any,now,under,no
9,its,under,do,they,state,one,no,they
