In [None]:
import os
import re
import pandas as pd
from collections import Counter

In [None]:
# Load the extracted e-mail data; the first CSV column becomes the DataFrame index.
df = pd.read_csv('final_extracted_data.csv',index_col=0)

In [None]:
# Drop columns that are not used in the rest of this notebook.
# In-place: re-running this cell without reloading `df` raises KeyError because the columns are already gone.
df.drop(columns=['Fwd','Sign-off Name','Name','Raw Content','Non English','Email','Content'],inplace=True)

### Name Mapping

In [None]:
# Male Mapping: replace each male author's standardised name with an anonymous ID (m1, m2, ...).
# Step 1: Extract the unique standardised names found on rows whose Gender is 'M'
unique_names = df[df['Gender'] == 'M']['Standardised Name'].unique()

# Step 2: Create a mapping dictionary; IDs are numbered in order of first appearance
male_name_mapping = {name: f'm{i+1}' for i, name in enumerate(unique_names)}

# Step 3: Apply the mapping; names that are not in the dictionary become NaN at this point
df['Mapped Name'] = df['Standardised Name'].map(male_name_mapping)

# Display the updated DataFrame
display(df)

In [None]:
# Female Mapping: same anonymisation as for male authors, with IDs f1, f2, ...
# Step 1: Extract the unique standardised names found on rows whose Gender is 'F'
unique_female_names = df[df['Gender'] == 'F']['Standardised Name'].unique()

# Step 2: Create a mapping dictionary; IDs are numbered in order of first appearance
female_name_mapping = {name: f'f{i+1}' for i, name in enumerate(unique_female_names)}

# Step 3: Apply the mapping
# Create a temporary series for female names mapping
temp_female_mapped = df['Standardised Name'].map(female_name_mapping)

# Use combine_first to fill NaN values in the 'Mapped Name' with the female mapping.
# Existing (male) IDs win: a name present in both dictionaries keeps its m-ID.
df['Mapped Name'] = df['Mapped Name'].combine_first(temp_female_mapped)

# Display the updated DataFrame
display(df)

In [None]:
import pickle

# Persist the real-name -> anonymous-ID dictionaries.
# NOTE(review): these files contain the real names, so they can undo the anonymisation — confirm they are stored accordingly.
# Save the male dictionary to a file
with open('male_name_mapping.pkl', 'wb') as file:
    pickle.dump(male_name_mapping, file)

# Save the female dictionary to a file
with open('female_name_mapping.pkl', 'wb') as file:
    pickle.dump(female_name_mapping, file)

In [None]:
# Inspection only. The first expression is evaluated but not shown; a notebook cell
# displays only its last expression (the female rows).
df[df['Gender'] == 'F']['Standardised Name'].unique()
df[df['Gender'] == 'F']

In [None]:
# Remove the remaining real-name columns now that 'Mapped Name' holds the anonymous IDs (in place).
df.drop(columns=['First Name','Standardised Name'],inplace=True)

In [None]:
# Show, then remove (in place), every row whose 'Cleaned Content' is missing.
rows_to_drop = df[df['Cleaned Content'].isna()]
display(rows_to_drop)
df.drop(index=rows_to_drop.index,inplace=True)

In [None]:
# Remove (in place) every row whose Gender is the literal string 'unknown'.
rows_to_drop = df[df['Gender'] == 'unknown']
df.drop(index=rows_to_drop.index,inplace=True)

In [None]:
# Number of remaining e-mails per gender label.
df['Gender'].value_counts()

In [None]:
# E-mails per anonymised male author (a Series indexed by 'Mapped Name').
male_authors = df[df['Gender'] == 'M']['Mapped Name'].value_counts()
print('Number of Unique male authors:', len(male_authors))


# Same counts as a two-column DataFrame.
male_authors_df = pd.DataFrame(male_authors)
male_authors_df.reset_index(inplace=True)
male_authors_df.columns = ['Mapped Name','Email Count']
# Not displayed: only the last expression of a cell is rendered.
male_authors_df

# E-mails per anonymised female author (a Series indexed by 'Mapped Name').
female_authors = df[df['Gender'] == 'F']['Mapped Name'].value_counts()
print('Number of Unique female authors:', len(female_authors))

# Same counts as a two-column DataFrame; this is the expression the cell displays.
female_authors_df = pd.DataFrame(female_authors)
female_authors_df.reset_index(inplace=True)
female_authors_df.columns = ['Mapped Name','Email Count']
female_authors_df

In [None]:
# Summary statistics of e-mails per male author.
male_authors.describe()

In [None]:
# Summary statistics of e-mails per female author.
female_authors.describe()

In [None]:
# Step 1: Count e-mails per anonymised author across both genders
mixed_authors = df['Mapped Name'].value_counts()
mixed_author = pd.DataFrame(mixed_authors)

# Step 2: Turn the counts into a two-column DataFrame
mixed_author.reset_index(inplace=True)
mixed_author.columns = ['Mapped Name', 'Email Count']

# Step 3: Drop duplicates in the original DataFrame to retain unique authors with their genders
# NOTE(review): an author ID that occurs with more than one Gender value yields several rows
# here and is therefore duplicated by the merge below — confirm this cannot happen.
unique_authors = df[['Mapped Name', 'Gender']].drop_duplicates()

# Step 4: Merge the new DataFrame with the unique authors DataFrame to retain the gender information
mixed_authors_df = pd.merge(mixed_author, unique_authors, on='Mapped Name', how='left')

# Display the new DataFrame
mixed_authors_df

### EDA of the distributions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Calculate mean and median of e-mails per author for each gender
mean_male = male_authors.mean()
median_male = male_authors.median()
mean_female = female_authors.mean()
median_female = female_authors.median()

# Pin the category order. Without it seaborn orders the x axis by first appearance in
# the data, so the positional annotations below (x=0 for male, x=1 for female) could
# end up above the wrong bar.
gender_order = ['M', 'F']

# Bar plot of the mean, annotated with the median just above each bar
plt.figure(figsize=(8, 6))
sns.barplot(x='Gender', y='Email Count', data=mixed_authors_df, order=gender_order, estimator=np.mean, ci=None)
plt.title('Mean Number of Emails per Author by Gender')
plt.ylabel('Mean Number of Emails')
plt.text(0, mean_male + 2, f'Median: {median_male:.1f}', ha='center', va='bottom', color='blue')
plt.text(1, mean_female + 2, f'Median: {median_female:.1f}', ha='center', va='bottom', color='blue')
plt.show()

# Box plot with the individual authors overlaid and the group means as dashed lines
plt.figure(figsize=(8, 6))
sns.boxplot(x='Gender', y='Email Count', data=mixed_authors_df, order=gender_order)
sns.stripplot(x='Gender', y='Email Count', data=mixed_authors_df, order=gender_order, color='orange', jitter=True, marker='o', alpha=0.5)
plt.title('Distribution of Emails per Author by Gender')
plt.ylabel('Number of Emails')
plt.axhline(mean_male, color='blue', linestyle='--', label=f'Mean Male: {mean_male:.1f}')
plt.axhline(mean_female, color='red', linestyle='--', label=f'Mean Female: {mean_female:.1f}')
plt.legend()
plt.show()

# Table of summary statistics
summary_stats = pd.DataFrame({
    'Statistic': ['Mean', 'Median'],
    'Male': [mean_male, median_male],
    'Female': [mean_female, median_female]
})
print(summary_stats)

In [None]:
# Filter top 10 authors for each gender
top10_male = male_authors_df.nlargest(10, 'Email Count')
top10_female = female_authors_df.nlargest(10, 'Email Count')

# Create subplots; the y axis (e-mail count) is shared so the two panels are comparable
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# Plot for male authors: authors on x, e-mail counts on y, so the axis labels follow suit
sns.barplot(ax=axes[0], x='Mapped Name', y='Email Count', data=top10_male, palette='Blues_d')
axes[0].set_title('Top 10 Male Authors by Email Count')
axes[0].set_xlabel('Author')
axes[0].set_ylabel('Email Count')

# Plot for female authors; the y label is left blank because the axis is shared
sns.barplot(ax=axes[1], x='Mapped Name', y='Email Count', data=top10_female, palette='Reds_d')
axes[1].set_title('Top 10 Female Authors by Email Count')
axes[1].set_xlabel('Author')
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd

# Separate data by gender (aliases of the per-author count tables, not copies)
male_df = male_authors_df
female_df = female_authors_df

# Calculate the threshold for the top 10% (90th percentile of e-mails per author)
male_threshold = male_df['Email Count'].quantile(0.90)
female_threshold = female_df['Email Count'].quantile(0.90)

# Identify the top 10% authors (authors at or above the threshold)
top_10_male = male_df[male_df['Email Count'] >= male_threshold]
top_10_female = female_df[female_df['Email Count'] >= female_threshold]

# Sum the email contributions of the top 10% authors
top_10_male_contribution = top_10_male['Email Count'].sum()
top_10_female_contribution = top_10_female['Email Count'].sum()

# Calculate the total email contributions for each gender
total_male_emails = male_df['Email Count'].sum()
total_female_emails = female_df['Email Count'].sum()

# Calculate the percentage contribution of the top 10%
top_10_male_percentage = (top_10_male_contribution / total_male_emails) * 100
top_10_female_percentage = (top_10_female_contribution / total_female_emails) * 100

# Display the results
print(f"Top 10% Male Authors Contribution: {top_10_male_contribution} emails ({top_10_male_percentage:.2f}% of total male emails)")
print(f"Top 10% Female Authors Contribution: {top_10_female_contribution} emails ({top_10_female_percentage:.2f}% of total female emails)")

### Sampling Method
Random sampling with a cap on each author

In [None]:
# Capped random sample of male e-mails; everything not sampled becomes the hold-out set.
df_male = df[df['Gender'] == 'M']
print('Original number of posts by male:', len(df_male))

# df_male holds the male authors' e-mails; the author key used below is 'Mapped Name'
original_size = len(df_male)  # not used again in the visible cells
desired_sample_size = 26500  # Desired sample size from male authors
initial_cap = 300  # Cap on contributions per author

# Step 1: Shuffle the entire dataset of male authors (fixed seed) and renumber rows from 0
df_male_shuffled = df_male.sample(frac=1, random_state=1).reset_index(drop=True)

# Step 2: Initialize an empty list to collect the final samples
final_male_samples = []

# Step 3: Create a dictionary to keep track of the number of contributions per male author in the sample
male_author_counts = {}

# Step 4: Iterate over the shuffled dataset and add to final_male_samples while respecting the cap
for index, row in df_male_shuffled.iterrows():
    author = row['Mapped Name']
    if male_author_counts.get(author, 0) < initial_cap:
        final_male_samples.append(row)
        male_author_counts[author] = male_author_counts.get(author, 0) + 1
    # Stop as soon as the target size is reached
    if len(final_male_samples) >= desired_sample_size:
        break

# Each collected row keeps its label from df_male_shuffled as its index
df_male_samples = pd.DataFrame(final_male_samples)

# Step 5: Create the hold-out validation set from the remaining data (rows whose labels were not sampled)
df_male_remaining = df_male_shuffled.drop(df_male_samples.index)

# Check the final sample sizes
final_sample_size = len(df_male_samples)
holdout_size = len(df_male_remaining)

print(f"Final sample size: {final_sample_size}")
print(f"Hold-out validation set size: {holdout_size}")
# Not displayed: a bare expression in the middle of a cell produces no output
df_male_samples['Mapped Name'].value_counts()[:30]

print('\nCap per male author:', initial_cap)
# The cap expressed as a percentage of the desired sample size
print('Author maximum post cap:', initial_cap / desired_sample_size * 100)

print('\nBasic Statistics:')
print(df_male_samples['Mapped Name'].value_counts().describe())
print('\nMedian:', df_male_samples['Mapped Name'].value_counts().median())

In [None]:
# Capped random sample of female e-mails; everything not sampled becomes the hold-out set.
df_female = df[df['Gender'] == 'F']
print('Original number of posts by female:', len(df_female))

# df_female holds the female authors' e-mails; the author key used below is 'Mapped Name'
original_size = len(df_female)  # not used again in the visible cells
desired_sample_size = 750  # Desired sample size from female authors (overwrites the male value)
initial_cap = 40  # Cap on contributions per author (overwrites the male value)

# Step 1: Shuffle the entire dataset of female authors (fixed seed) and renumber rows from 0
df_female_shuffled = df_female.sample(frac=1, random_state=1).reset_index(drop=True)

# Step 2: Initialize an empty list to collect the final samples
final_female_samples = []

# Step 3: Create a dictionary to keep track of the number of contributions per female author in the sample
female_author_counts = {}

# Step 4: Iterate over the shuffled dataset and add to final_female_samples while respecting the cap
for index, row in df_female_shuffled.iterrows():
    author = row['Mapped Name']
    if female_author_counts.get(author, 0) < initial_cap:
        final_female_samples.append(row)
        female_author_counts[author] = female_author_counts.get(author, 0) + 1
    # Stop as soon as the target size is reached
    if len(final_female_samples) >= desired_sample_size:
        break

# Check the final sample size
final_sample_size = len(final_female_samples)

# Each collected row keeps its label from df_female_shuffled as its index
df_female_samples = pd.DataFrame(final_female_samples)

# Step 5: Create the hold-out validation set from the remaining data (rows whose labels were not sampled)
df_female_remaining = df_female_shuffled.drop(df_female_samples.index)

# Check the final sample sizes
final_sample_size = len(df_female_samples)
holdout_size = len(df_female_remaining)

print(f"Final sample size: {final_sample_size}")
print(f"Hold-out validation set size: {holdout_size}")

print('\nCap per female author:', initial_cap)
# The cap expressed as a percentage of the desired sample size
print('Author maximum post cap:', initial_cap / desired_sample_size * 100)

print('\nBasic Statistics:')
print(df_female_samples['Mapped Name'].value_counts().describe())
print('\nMedian:', df_female_samples['Mapped Name'].value_counts().median())

In [None]:
# Stack the male and female parts into one training sample and one hold-out set.
# The male and female frames were each renumbered from 0 before sampling, so a plain
# concat would leave duplicate index labels; ignore_index=True gives every row a
# unique 0..n-1 label, which keeps label-based operations (joins, .loc) safe.
df_samples = pd.concat([df_male_samples, df_female_samples], axis=0, ignore_index=True)
df_validation = pd.concat([df_male_remaining, df_female_remaining], axis=0, ignore_index=True)

In [None]:
# Number of distinct authors per gender in the sample (one row kept per 'Mapped Name').
df_samples.drop_duplicates(subset='Mapped Name')['Gender'].value_counts()

In [None]:
# Step 1: Count sampled e-mails per anonymised author
df_samples_count = df_samples['Mapped Name'].value_counts()

# Step 2: Turn the counts into a two-column DataFrame
df_samples_count = pd.DataFrame(df_samples_count)
df_samples_count.reset_index(inplace=True)
df_samples_count.columns = ['Mapped Name','Email Count']

# Step 3: Drop duplicates in the original DataFrame to retain unique authors with their genders
unique_authors = df[['Mapped Name', 'Gender']].drop_duplicates()

# Step 4: Merge the new DataFrame with the unique authors DataFrame to retain the gender information
df_samples_count = pd.merge(df_samples_count, unique_authors, on='Mapped Name', how='left')



In [None]:
# Mean number of sampled e-mails per author, per gender
sampled_mean_male = df_male_samples['Mapped Name'].value_counts().mean()
sampled_mean_female = df_female_samples['Mapped Name'].value_counts().mean()


# Box plot of sampled e-mails per author, with individual authors overlaid and
# the group means drawn as dashed horizontal lines
plt.figure(figsize=(10, 6))
sns.boxplot(x='Gender', y='Email Count', data=df_samples_count)
sns.stripplot(x='Gender', y='Email Count', data=df_samples_count, color='orange', jitter=True, marker='o', alpha=0.5)
plt.title('Distribution of Emails per Author by Gender')
plt.ylabel('Number of Emails')
plt.axhline(sampled_mean_male, color='blue', linestyle='--', label=f'Mean Male: {sampled_mean_male:.1f}')
plt.axhline(sampled_mean_female, color='red', linestyle='--', label=f'Mean Female: {sampled_mean_female:.1f}')
plt.legend()
plt.show()

### Define features and their functions

#### F1: Word Count - F4: Vocabulary Richness

In [None]:
# Lexical Features F1-F4
# F1: number of words
def word_count(text):
    """Return the number of words in `text`; 0 when `text` is missing (None/NaN).

    A word is a run of word characters that may contain internal hyphens or
    apostrophes, e.g. "don't" and "well-known" each count once.
    """
    if pd.isnull(text):
        return 0
    return sum(1 for _ in re.finditer(r"\b\w+(?:[-']\w+)*\b", text))

# F2: number of characters
def character_count(text):
    """Return len(text), or 0 when `text` is missing (None/NaN)."""
    return 0 if pd.isnull(text) else len(text)

# F3: mean word length
def average_word_length(text):
    """Return the mean length of the words in `text`.

    Words are matched with the same pattern as `word_count`. Returns 0 when
    `text` is missing or contains no word at all.
    """
    if pd.isnull(text):
        return 0
    tokens = re.findall(r"\b\w+(?:[-']\w+)*\b", text)
    # Blank or punctuation-only text has no words: avoid dividing by zero
    if not tokens:
        return 0
    return sum(map(len, tokens)) / len(tokens)

# F4: Yule's K
def vocabulary_richness(text):
    """Return Yule's K for `text`: 10^4 * sum(f * (f - 1)) / N^2.

    `f` runs over the frequency of every distinct lowercased word and `N` is
    the total number of words. Returns 0 when `text` is missing (None/NaN) or
    contains no words, matching the other lexical feature functions.
    """
    # Guard against missing values; calling .lower() on None/NaN would raise
    if pd.isnull(text):
        return 0

    # Tokenize the lowercased text into words using a regular expression
    words = re.findall(r"\b\w+(?:[-']\w+)*\b", text.lower())

    # Total number of words
    total_words = len(words)
    if total_words == 0:
        return 0

    # Calculate the frequency of each word
    word_freq = Counter(words)

    # Calculate Yule's K
    sum_fi_i_minus_1 = sum(freq * (freq - 1) for freq in word_freq.values())
    k_value = 10**4 * sum_fi_i_minus_1 / total_words**2

    return k_value

In [None]:
# Compute the lexical features F1-F4 from 'Cleaned Content' for the training sample
df_samples['F1 Word Count'] = df_samples['Cleaned Content'].apply(word_count).astype(int)
df_samples['F2 Character Count'] = df_samples['Cleaned Content'].apply(character_count)
df_samples['F3 Average Word Length'] = df_samples['Cleaned Content'].apply(average_word_length)
df_samples['F4 Vocabulary Richness'] = df_samples['Cleaned Content'].apply(vocabulary_richness)

# Same features for the hold-out validation set
df_validation['F1 Word Count'] = df_validation['Cleaned Content'].apply(word_count).astype(int)
df_validation['F2 Character Count'] = df_validation['Cleaned Content'].apply(character_count)
df_validation['F3 Average Word Length'] = df_validation['Cleaned Content'].apply(average_word_length)
df_validation['F4 Vocabulary Richness'] = df_validation['Cleaned Content'].apply(vocabulary_richness)

#### F5: Average Sentence Length - F8: Punctuation

In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from collections import Counter

# Fetch the NLTK resources for sentence/word tokenisation and POS tagging.
# NOTE(review): resource names differ between NLTK releases — confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# F5: Average Sentence Length
def average_sentence_length(text):
    """Return the mean number of words per sentence; 0 for non-string or sentence-less input.

    Sentences come from `sent_tokenize`; words are counted with `word_count`.
    """
    if not isinstance(text, str):
        return 0
    sentence_list = sent_tokenize(text)
    n_sentences = len(sentence_list)
    if n_sentences == 0:
        return 0
    word_total = 0
    for sentence in sentence_list:
        word_total += word_count(sentence)
    return word_total / n_sentences

# F6: Part-of-Speech (POS) Tags Distribution
def pos_tags_distribution(text):
    """Return {POS tag: share of tagged tokens} for `text`.

    Tokens whose tag is one of the punctuation/symbol tags listed below are
    ignored, so the shares add up to 1 over the remaining tokens. Returns an
    empty dict for non-string input or when no token remains.
    """
    if not isinstance(text, str):
        return {}

    ignored_tags = ('$', "''", '(', ')', ',', '--', '.', ':', '``', '#')

    # Tally every tag that is not in the ignore list
    tally = Counter()
    for _token, tag in pos_tag(word_tokenize(text)):
        if tag not in ignored_tags:
            tally[tag] += 1

    kept_total = sum(tally.values())
    # An empty tally yields an empty dict, so no division takes place
    return {tag: hits / kept_total for tag, hits in tally.items()}

# F7: Sentence Complexity (Number of Clauses per Sentence)
def sentence_complexity(text):
    """Return the average number of clause-indicator words per sentence.

    Every token whose lowercase form is in the indicator list counts as one
    clause marker. Returns 0 for non-string input or text without sentences.
    """
    if not isinstance(text, str):
        return 0
    sentence_list = sent_tokenize(text)
    if not sentence_list:
        return 0
    # The comparison is per token, so the two-word entry 'even though' cannot match on its own
    clause_indicator_words = ['that', 'which', 'who', 'whom', 'whose', 'because', 'since', 'unless', 'although', 'if', 'when', 'while', 'until', 'before', 'after', 'as', 'though', 'even though']
    marker_hits = 0
    for sentence in sentence_list:
        for token in word_tokenize(sentence):
            if token.lower() in clause_indicator_words:
                marker_hits += 1
    return marker_hits / len(sentence_list)

# F8: Punctuation Usage Distribution
def punctuation_usage(text):
    """Return {mark: share} over the tracked punctuation marks found in `text`.

    Tracked marks are . , ! ? ; : and the apostrophe. Each share is that
    mark's count divided by the count of all tracked marks. Returns an empty
    dict for non-string input or when no tracked mark occurs.
    """
    if not isinstance(text, str):
        return {}

    tracked_marks = ['.', ',', '!', '?', ';', ':', "'"]

    # Tally tracked marks character by character
    tally = Counter()
    for ch in text:
        if ch in tracked_marks:
            tally[ch] += 1

    grand_total = sum(tally.values())
    if grand_total <= 0:
        return {}

    return {mark: hits / grand_total for mark, hits in tally.items()}

In [None]:
# Compute the syntactic features from 'Cleaned Content' for the training sample.
# The POS and punctuation columns hold one dict per row; they are expanded into
# separate columns later in the notebook.
df_samples['F5 Sentence Length'] = df_samples['Cleaned Content'].apply(average_sentence_length)
df_samples['F6 POS Tags Distribution'] = df_samples['Cleaned Content'].apply(pos_tags_distribution)
df_samples['F42 Sentence Complexity'] = df_samples['Cleaned Content'].apply(sentence_complexity)
df_samples['F43 Punctuation Usage'] = df_samples['Cleaned Content'].apply(punctuation_usage)

# Same features for the hold-out validation set
df_validation['F5 Sentence Length'] = df_validation['Cleaned Content'].apply(average_sentence_length)
df_validation['F6 POS Tags Distribution'] = df_validation['Cleaned Content'].apply(pos_tags_distribution)
df_validation['F42 Sentence Complexity'] = df_validation['Cleaned Content'].apply(sentence_complexity)
df_validation['F43 Punctuation Usage'] = df_validation['Cleaned Content'].apply(punctuation_usage)

#### F50: Readability - F52 Politeness Markers

In [None]:
import pandas as pd
import textstat

# F50: Flesch Reading Ease score
def flesch_reading_ease(text):
    """Return textstat's Flesch Reading Ease score for `text`, or None for non-string input."""
    return textstat.flesch_reading_ease(text) if isinstance(text, str) else None

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

# Same resources as requested in the F5-F8 cell; repeated so this cell can run on its own.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# F51: Define the function to calculate formality score
def formality_score(text):
    """Return the formality F-score of `text` on a 0-100 scale.

    F = (formal% - informal% + 100) / 2, where formal% is the percentage of
    words tagged as noun, adjective, preposition or determiner, and informal%
    the percentage tagged as pronoun, verb, adverb or interjection. Using
    percentages rather than raw counts keeps the score independent of text
    length. Returns None for non-string input and the neutral value 50.0 when
    the text contains no word tokens.
    """
    if not isinstance(text, str):
        return None

    # Tokenize and POS tag the text
    tagged = pos_tag(word_tokenize(text))

    # Only tokens with at least one letter or digit count as words; pure
    # punctuation tokens must not dilute the percentages.
    word_tags = [tag for token, tag in tagged if any(ch.isalnum() for ch in token)]
    total_words = len(word_tags)
    if total_words == 0:
        return 50.0

    pos_counts = Counter(word_tags)

    formal_tags = ('NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'IN', 'DT')
    informal_tags = ('PRP', 'PRP$', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS', 'UH')
    formal = sum(pos_counts[tag] for tag in formal_tags)
    informal = sum(pos_counts[tag] for tag in informal_tags)

    # Express both categories as percentages of all words before combining them
    formal_pct = 100 * formal / total_words
    informal_pct = 100 * informal / total_words
    f_score = (formal_pct - informal_pct + 100) / 2
    return f_score

# Define polite words and phrases
polite_words = set([
    'please', 'thank you', 'ta', 'thanks', 'sorry', 'excuse me', 'would you mind', 'could you',
    'would you be so kind', 'I would appreciate', 'if it\'s not too much trouble',
    'thanks', 'cheers', 'much appreciated', 'kindly', 'pardon', 'beg your pardon',
    'if you don\'t mind', 'grateful', 'obliged', 'do me a favor', 'be so kind',
    'please do', 'be kind enough', 'thankful', 'respectfully', 'with respect',
    'if I may', 'if I might', 'if it pleases you', 'would it trouble you'
])

# Define impolite words and phrases
impolite_words = set([
    'stupid', 'idiot', 'dumb', 'shut up', 'shut your mouth', 'suck', 'jerk',
    'moron', 'fool', 'loser', 'annoying', 'pathetic', 'horrible', 'dick', 'bitch',
    'asshole', 'screw you', 'piss off', 'damn', 'crap', 'bastard', 'hell', 'shit',
    'fuck', 'fucking', 'freak', 'douche', 'scum', 'trash', 'worthless'
])


def _compile_phrase_pattern(phrases):
    """Build one case-normalised regex that matches any of `phrases` as whole words."""
    # Longest entries first, so 'would you be so kind' wins over 'be so kind'
    # and overlapping entries are counted once.
    ordered = sorted({phrase.lower() for phrase in phrases}, key=lambda p: (-len(p), p))
    # Allow any whitespace run between the words of a multi-word phrase
    alternatives = [r'\s+'.join(re.escape(part) for part in phrase.split()) for phrase in ordered]
    return re.compile(r'(?<!\w)(?:' + '|'.join(alternatives) + r')(?!\w)')


_POLITE_PATTERN = _compile_phrase_pattern(polite_words)
_IMPOLITE_PATTERN = _compile_phrase_pattern(impolite_words)


# F52: Define the function to calculate politeness degree
def politeness_degree(text):
    """Return (polite matches - impolite matches) / number of tokens.

    Matching is done on the lowercased text with whole-word patterns, so
    multi-word entries such as 'thank you' and entries written with a capital
    'I' are counted; comparing them with single lowercased tokens could never
    match. Returns None for non-string input and 0 when the text has no tokens.
    """
    if not isinstance(text, str):
        return None

    lowered = text.lower()

    # Token count is the normalising denominator
    total_words = len(word_tokenize(lowered))
    if total_words == 0:
        return 0

    # Count non-overlapping polite and impolite matches
    polite_count = len(_POLITE_PATTERN.findall(lowered))
    impolite_count = len(_IMPOLITE_PATTERN.findall(lowered))

    # Calculate politeness score
    politeness_score = (polite_count - impolite_count) / total_words
    return politeness_score

In [None]:
# Compute readability, formality and politeness from 'Cleaned Content' for the training sample
df_samples['F50 Readability Score'] = df_samples['Cleaned Content'].apply(flesch_reading_ease)
df_samples['F51 Formality Score'] = df_samples['Cleaned Content'].apply(formality_score)
df_samples['F52 Politeness Degree'] = df_samples['Cleaned Content'].apply(politeness_degree)

# Same features for the hold-out validation set
df_validation['F50 Readability Score'] = df_validation['Cleaned Content'].apply(flesch_reading_ease)
df_validation['F51 Formality Score'] = df_validation['Cleaned Content'].apply(formality_score)
df_validation['F52 Politeness Degree'] = df_validation['Cleaned Content'].apply(politeness_degree)

### Cleaning and Normalisation

In [None]:
import pandas as pd
import contractions
from textblob import TextBlob

# Define common abbreviations: lowercase abbreviation -> expansion.
# convert_abbreviations looks up whole whitespace-separated tokens (lowercased) in this
# table, so a token with attached punctuation such as "thx," is left unchanged.
abbreviations = {
    "u": "you",
    "r": "are",
    "ur": "your",
    "lol": "laughing out loud",
    "omg": "oh my god",
    "idk": "I don't know",
    "btw": "by the way",
    "brb": "be right back",
    "gtg": "got to go",
    "ttyl": "talk to you later",
    "np": "no problem",
    "thx": "thanks",
    "plz": "please",
    "w/": "with",
    "w/o": "without",
}

# Contraction expansion
def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

# Abbreviation expansion
def convert_abbreviations(text):
    """Replace whitespace-separated tokens found in `abbreviations` with their expansion.

    Lookup is case-insensitive on the whole token. The result is re-joined
    with single spaces, so original runs of whitespace (including newlines)
    are collapsed.
    """
    expanded = []
    for token in text.split():
        replacement = abbreviations.get(token.lower(), token)
        expanded.append(replacement)
    return ' '.join(expanded)

# Normalisation entry point
def clean_normalize_text(text):
    """Expand contractions, then abbreviations; return None for non-string input.

    Spelling correction is not applied.
    """
    if not isinstance(text, str):
        return None
    return convert_abbreviations(expand_contractions(text))

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Fetch the tokenizer models and the stop-word lists
nltk.download('punkt')
nltk.download('stopwords')

# Define stop words and punctuation.
# Negation words are removed from the stop-word set, so they stay in the tokenised text.
stop_words = set(stopwords.words('english'))
negation_words = {'not', 'no', 'never', 'none', 'nobody', 'nothing', 'neither', 'nowhere', 'hardly', 'scarcely', 'barely'}
stop_words = stop_words - negation_words
punctuation = set(string.punctuation)

# Tokenise and filter
def remove_stopwords_punctuation(text):
    """Lowercase and tokenise `text`, keeping only informative alphabetic tokens.

    A token is dropped when it is a stop word, a punctuation character,
    contains non-alphabetic characters, or is a single letter. Returns a list
    of tokens, or None for non-string input.
    """
    if not isinstance(text, str):
        return None

    kept = []
    for token in word_tokenize(text.lower()):
        if token.lower() in stop_words:
            continue
        if token in punctuation:
            continue
        if not token.isalpha() or len(token) <= 1:
            continue
        kept.append(token)
    return kept

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Initialize the WordNetLemmatizer once; lemmatize_tokens reuses this instance
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Return the WordNet POS constant for `word`, defaulting to noun.

    The word is tagged on its own (without sentence context); the first letter
    of its tag selects adjective (J), noun (N), verb (V) or adverb (R).
    """
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every other tag fall back to noun
    return wordnet.NOUN

# Function to lemmatize tokens
def lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, using each token's own POS tag.

    `tokens` may be None: the tokenising step returns None for non-string
    text and the sentiment functions accept None, so None is passed through
    instead of raising.
    """
    if tokens is None:
        return None
    return [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]


In [None]:
# Training sample: Cleaned Content -> Normalized (string) -> Tokenized (list) -> Lemmatized (list)
# Apply the cleaning and normalization function to the Content column
df_samples['Normalized Content'] = df_samples['Cleaned Content'].apply(clean_normalize_text)
# Apply the tokenization and removal function to the Normalized Content column
df_samples['Tokenized Content'] = df_samples['Normalized Content'].apply(remove_stopwords_punctuation)
# Apply the lemmatization function to the Tokenized Content column
df_samples['Lemmatized Content'] = df_samples['Tokenized Content'].apply(lemmatize_tokens)

# Hold-out validation set: same three steps
# Apply the cleaning and normalization function to the Content column
df_validation['Normalized Content'] = df_validation['Cleaned Content'].apply(clean_normalize_text)
# Apply the tokenization and removal function to the Normalized Content column
df_validation['Tokenized Content'] = df_validation['Normalized Content'].apply(remove_stopwords_punctuation)
# Apply the lemmatization function to the Tokenized Content column
df_validation['Lemmatized Content'] = df_validation['Tokenized Content'].apply(lemmatize_tokens)

In [None]:
# F53: sentiment polarity
def get_sentiment_polarity(tokens):
    """Return TextBlob's polarity for the space-joined `tokens`, or None when `tokens` is None."""
    return None if tokens is None else TextBlob(' '.join(tokens)).sentiment.polarity

# F54: subjectivity
def get_subjectivity(tokens):
    """Return TextBlob's subjectivity for the space-joined `tokens`, or None when `tokens` is None."""
    return None if tokens is None else TextBlob(' '.join(tokens)).sentiment.subjectivity

In [None]:
# Sentiment features are computed on the lemmatised token lists (training sample)
df_samples['F53 Sentiment Polarity'] = df_samples['Lemmatized Content'].apply(get_sentiment_polarity)
df_samples['F54 Subjectivity'] = df_samples['Lemmatized Content'].apply(get_subjectivity)

# Same features for the hold-out validation set
df_validation['F53 Sentiment Polarity'] = df_validation['Lemmatized Content'].apply(get_sentiment_polarity)
df_validation['F54 Subjectivity'] = df_validation['Lemmatized Content'].apply(get_subjectivity)

### LDA Topic Modelling

In [None]:
from nltk.corpus import stopwords

# Step 1: Define or load stopwords
nltk.download('stopwords')
common_words = set(stopwords.words('english'))
# Add custom stopwords for topic modelling. This list also removes 'not' and 'no',
# which the earlier tokenising step deliberately kept. Repeated entries are harmless in a set.
common_words.update(['not', 'get', 'would', 'like', 'even', 'one', 'go', 'make', 'see', 'say', 'know', 'u', 
                     'well', 'say', 'think', 'like', 'time', 'make', 'look', 'could', 'no', 'come', 
                     'last', 'even', 'still', 'much', 'really', 'want', 'need', 'right', 
                     'take', 'way', 'thing', 'best', 'play', 'give', 'u', 'seem', 'also', 'bit', 'let'])

# Step 2: Filter stopwords from the Lemmatized Content
def remove_stopwords(text):
    """Return the tokens of `text` that are not in `common_words`.

    `text` is a list of tokens. None (produced upstream for non-string
    content) is treated as an empty document and yields an empty list, which
    is a valid input for building the gensim dictionary and corpus.
    """
    if text is None:
        return []
    return [word for word in text if word not in common_words]

# Only the training sample gets this column; the LDA dictionary and corpus are built from it.
df_samples['Updated Lemmatized Content'] = df_samples['Lemmatized Content'].apply(remove_stopwords)

In [None]:
from gensim import corpora
from gensim.models import LdaModel

# Common words were already removed in 'Updated Lemmatized Content'

# Create a dictionary (token -> id) and a bag-of-words corpus for LDA from the training sample
unigram_dictionary = corpora.Dictionary(df_samples['Updated Lemmatized Content'])
corpus = [unigram_dictionary.doc2bow(text) for text in df_samples['Updated Lemmatized Content']]

# Function to get topic distribution for each document
def get_topic_distribution(lda_model, corpus, num_topics):
    """Return one list of `num_topics` topic probabilities per document in `corpus`.

    Parameters
    ----------
    lda_model : object exposing `get_document_topics(bow, minimum_probability=...)`
    corpus : iterable of bag-of-words documents
    num_topics : int, length of every returned distribution

    The model is queried with `minimum_probability=0.0` so that no topic is
    dropped. With the model's default threshold, low-probability topics are
    omitted and would show up here as exact zeros, leaving distributions that
    do not sum to 1.
    """
    topic_distributions = []
    for doc in corpus:
        topic_dist = [0] * num_topics
        for topic_num, prob in lda_model.get_document_topics(doc, minimum_probability=0.0):
            topic_dist[topic_num] = prob
        topic_distributions.append(topic_dist)
    return topic_distributions

# Build LDA model on the training corpus (fixed random_state for repeatable topics)
num_topics = 5
lda_unigram_model = LdaModel(corpus, num_topics=num_topics, id2word=unigram_dictionary, passes=40, alpha='auto', eta='auto', random_state=42)

In [None]:
# Print each topic with its 15 highest-weighted words
topics = lda_unigram_model.print_topics(num_words=15)
for topic in topics:
    print(topic)

In [None]:
# Extract the 15 top words and their weights for one topic (here: topic index 4)
topic_words = dict(lda_unigram_model.show_topic(4, topn=15))
# NOTE: the name `string` shadows the `string` module imported in the stop-word cell;
# re-running that cell after this one fails on `string.punctuation`.
string =  str(topic_words.keys())

In [None]:
# Remove all single quotation marks from the topic-word string built in the previous cell
string_without_quotes = string.replace("'", "")

# Print the result
print(string_without_quotes)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# One word cloud per topic, sized by the weights of its 20 top words
for i in range(num_topics):
    plt.figure()
    topic_words = dict(lda_unigram_model.show_topic(i, topn=20))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(topic_words)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f'Topic {i}')
    plt.show()

#### F55: Topic Distribution

In [None]:
# Get topic distribution for each training document (one list of num_topics values per row)
df_samples['F55 Topic Distribution'] = get_topic_distribution(lda_unigram_model, corpus, num_topics)

# Validation documents are encoded with the dictionary built from the training sample.
# NOTE(review): the validation text is not passed through remove_stopwords; this presumably
# makes no difference because doc2bow skips tokens that are not in the dictionary — confirm.
val_corpus = [unigram_dictionary.doc2bow(text) for text in df_validation['Lemmatized Content']]
df_validation['F55 Topic Distribution'] = get_topic_distribution(lda_unigram_model, val_corpus, num_topics)

In [None]:
# Checkpoint: keep copies of both frames with all features computed so far
df_samples_copy = df_samples.copy()
df_validation_copy = df_validation.copy()

In [None]:
# Resetting: restore both frames from the checkpoint copies and renumber rows from 0
df_samples = df_samples_copy.reset_index(drop=True)
df_validation = df_validation_copy.reset_index(drop=True)

### Expanding the features

In [None]:
# Make sure both frames have a fresh 0..n-1 index; expand_and_drop_feature_column joins on the index
df_samples = df_samples.reset_index(drop=True)
df_validation = df_validation.reset_index(drop=True)

In [None]:
# (rows, columns) of the validation set
df_validation.shape

In [None]:
# Column names of the validation set
df_validation.columns

In [None]:
# (rows, columns) of the training sample
df_samples.shape

In [None]:
def expand_and_drop_feature_column(df, feature_column):
    """Replace a dict- or list-valued column by one column per key/position.

    Each cell of `feature_column` is unpacked with `pd.Series`; the resulting
    columns are sorted by name and joined to `df` on the index, and the
    original column is removed. A new DataFrame is returned; `df` itself is
    left unchanged. Keys that are missing in a row become NaN.
    """
    unpacked = df[feature_column].apply(pd.Series)
    ordered_labels = sorted(unpacked.columns)
    unpacked = unpacked.reindex(columns=ordered_labels)
    return df.join(unpacked).drop(columns=[feature_column])

#### F6 POS Tags

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scaler used to bring the expanded POS features into [0, 1]; fitted on the training sample only
scaler = MinMaxScaler()

# Expand the dictionary into separate columns (only to discover the tag names)
expanded_f6 = df_samples['F6 POS Tags Distribution'].apply(pd.Series)
# Arrange the POS tags (column names) in ascending order
expanded_f6 = expanded_f6.reindex(sorted(expanded_f6.columns), axis=1)

POS_list = list(expanded_f6.columns)

# Number the features consecutively from F6 in sorted tag order
i = 6
new_list = []
for POS in POS_list:
    new = 'F' + str(i) + ' ' + POS
    new_list.append(new)
    i += 1

rename_dict = dict(zip(POS_list,new_list))

print(rename_dict)

df_samples = expand_and_drop_feature_column(df_samples, 'F6 POS Tags Distribution')
df_validation = expand_and_drop_feature_column(df_validation, 'F6 POS Tags Distribution')

df_samples = df_samples.rename(columns=rename_dict)
df_validation = df_validation.rename(columns=rename_dict)

# A tag seen in the training sample may never occur in the validation set; create the
# column so the selections below cannot raise KeyError.
for column in new_list:
    if column not in df_validation.columns:
        df_validation[column] = 0.0

# A tag that does not occur in a document has proportion 0. Fill before scaling:
# left as NaN, these cells are ignored when the scaler is fitted and, once filled
# with 0 later on, collapse onto the smallest observed non-zero value.
df_samples[new_list] = df_samples[new_list].fillna(0)
df_validation[new_list] = df_validation[new_list].fillna(0)

display(df_samples[new_list].head())


# Fit and transform the df_samples dataset
df_samples[new_list] = scaler.fit_transform(df_samples[new_list])

# Transform the df_validation dataset (using the same scaler fit on the training set)
df_validation[new_list] = scaler.transform(df_validation[new_list])

display(df_samples[new_list].head())

In [None]:
# Column names of the training sample after the POS expansion
df_samples.columns

#### F43 Punctuation Usage

In [None]:
# Expand the dictionary into separate columns (only to discover the punctuation marks)
expanded_f43 = df_samples['F43 Punctuation Usage'].apply(pd.Series)
# Arrange the punctuation marks (column names) in ascending order
expanded_f43 = expanded_f43.reindex(sorted(expanded_f43.columns), axis=1)

punc_list = list(expanded_f43.columns)

# Number the features consecutively from F43 in sorted mark order
i = 43
new_list = []
for punc in punc_list:
    new = 'F' + str(i) + ' ' + punc
    new_list.append(new)
    i += 1

rename_dict = dict(zip(punc_list,new_list))
print(rename_dict)

df_samples = expand_and_drop_feature_column(df_samples, 'F43 Punctuation Usage')
df_validation = expand_and_drop_feature_column(df_validation, 'F43 Punctuation Usage')

df_samples = df_samples.rename(columns=rename_dict)
df_validation = df_validation.rename(columns=rename_dict)

# A mark seen in the training sample may never occur in the validation set; create the
# column so the selections below cannot raise KeyError.
for column in new_list:
    if column not in df_validation.columns:
        df_validation[column] = 0.0

# A mark that does not occur in a document has share 0. Fill before scaling:
# left as NaN, these cells are ignored when the scaler is fitted and, once filled
# with 0 later on, collapse onto the smallest observed non-zero value.
df_samples[new_list] = df_samples[new_list].fillna(0)
df_validation[new_list] = df_validation[new_list].fillna(0)

# Apply normalization to the df_samples dataset (re-fits the scaler created in the POS cell)
df_samples[new_list] = scaler.fit_transform(df_samples[new_list])

# Apply normalization to the df_validation dataset (same scaler, fitted on the training set)
df_validation[new_list] = scaler.transform(df_validation[new_list])

#### F55 Topic Distribution

In [None]:
# Expand the per-document topic list into separate columns (only to discover the positions)
expanded_f55 = df_samples['F55 Topic Distribution'].apply(pd.Series)
# Arrange the topic positions (column names) in ascending order
expanded_f55 = expanded_f55.reindex(sorted(expanded_f55.columns), axis=1)

topic_list = list(expanded_f55.columns)

# Feature numbers start at F55; topic labels are 1-based ('F55 Topic 1', 'F56 Topic 2', ...)
i = 55
j = 1
new_list = []
for topic in topic_list:
    new = 'F' + str(i) + ' Topic ' + str(j)
    new_list.append(new)
    i += 1
    j += 1

rename_dict = dict(zip(topic_list,new_list))
print(rename_dict)

df_samples = expand_and_drop_feature_column(df_samples, 'F55 Topic Distribution')
df_validation = expand_and_drop_feature_column(df_validation, 'F55 Topic Distribution')

df_samples = df_samples.rename(columns=rename_dict)
df_validation = df_validation.rename(columns=rename_dict)

In [None]:
# Combining F56 Topic 2 and F57 Topic 3: their probabilities are added into 'F56 Topic 2'
# and the 'F57 Topic 3' column is removed. Not re-runnable: a second run raises KeyError.
df_samples['F56 Topic 2'] = df_samples['F56 Topic 2'] + df_samples['F57 Topic 3']
df_samples = df_samples.drop(columns=['F57 Topic 3'])

df_validation['F56 Topic 2'] = df_validation['F56 Topic 2'] + df_validation['F57 Topic 3']
df_validation = df_validation.drop(columns=['F57 Topic 3'])

In [None]:
# Close the numbering gap left by the merged topic: old topics 4 and 5 become 3 and 4
topic_list = ['F58 Topic 4', 'F59 Topic 5']
new_list = ['F57 Topic 3', 'F58 Topic 4']
rename_dict = dict(zip(topic_list,new_list))

df_samples = df_samples.rename(columns=rename_dict)
df_validation = df_validation.rename(columns=rename_dict)

df_samples.columns

### Create New df that contains Features

In [None]:
# Columns deliberately left out of the feature table (this list is not referenced below)
features_exclude = ['Subject', 'Quote', 'Normalized Content', 'Tokenized Content', 'Lemmatized Content']

# Identifier columns followed by every feature column, in F-number order.
# The POS (F6-F41) and punctuation (F43-F49) names are hard-coded; they must match the
# names generated in the expansion cells, otherwise the selection below raises KeyError.
features = ['ID', 'Gender', 'Mapped Name', 'Cleaned Content',
       'F1 Word Count', 'F2 Character Count', 'F3 Average Word Length',
       'F4 Vocabulary Richness', 'F5 Sentence Length', 'F6 CC', 'F7 CD', 'F8 DT', 'F9 EX', 'F10 FW', 'F11 IN', 'F12 JJ',
       'F13 JJR', 'F14 JJS', 'F15 LS', 'F16 MD', 'F17 NN', 'F18 NNP',
       'F19 NNPS', 'F20 NNS', 'F21 PDT', 'F22 POS', 'F23 PRP', 'F24 PRP$',
       'F25 RB', 'F26 RBR', 'F27 RBS', 'F28 RP', 'F29 SYM', 'F30 TO', 'F31 UH',
       'F32 VB', 'F33 VBD', 'F34 VBG', 'F35 VBN', 'F36 VBP', 'F37 VBZ',
       'F38 WDT', 'F39 WP', 'F40 WP$', 'F41 WRB',
       'F42 Sentence Complexity', 
       'F43 !', 'F44 \'', 'F45 ,', 'F46 .', 'F47 :', 'F48 ;', 'F49 ?',
       'F50 Readability Score', 'F51 Formality Score', 'F52 Politeness Degree',
       'F53 Sentiment Polarity', 'F54 Subjectivity',
       'F55 Topic 1', 'F56 Topic 2', 'F57 Topic 3', 'F58 Topic 4']

df_features = df_samples[features]
df_validation_features = df_validation[features]

# Fill all NaN values with 0 and make sure every column label is a string
df_features = df_features.fillna(0)
df_features.columns = df_features.columns.astype(str)

df_validation_features = df_validation_features.fillna(0)
df_validation_features.columns = df_validation_features.columns.astype(str)

In [None]:
# Write both feature tables to CSV; the DataFrame index is written as the first column
df_features.to_csv('features.csv')
df_validation_features.to_csv('validation_features.csv')