In [None]:
import os
import pandas as pd
from matplotlib import pyplot as plt

# Directory the input CSV is read from. exist_ok=True makes the call a no-op
# when the directory is already there, without a separate existence check
# that could race with another process creating it.
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

# Directory the rendered figures are saved into.
plot_dir = "plots"
os.makedirs(plot_dir, exist_ok=True)

# Load the labelled messages from <data_dir>/messages.csv.
messages_path = os.path.join(data_dir, "messages.csv")
speech_df = pd.read_csv(messages_path)

# Split the rows by the value of the "Aggressive" column (1 vs. 0).
labels = speech_df["Aggressive"]
aggressive = speech_df.loc[labels == 1]
non_aggressive = speech_df.loc[labels == 0]

# Class balance: absolute row counts first, then each class as a share of
# all rows in the file.
n_total = len(speech_df)
n_aggressive = len(aggressive)
n_non_aggressive = len(non_aggressive)

print(f'Aggressive Samples: {n_aggressive}, Non-Aggressive Samples: {n_non_aggressive}')
print(f'Total Samples: {n_total}')

print(f'Aggressive Percentage: {n_aggressive / n_total * 100}')
print(f'Non-Aggressive Percentage: {n_non_aggressive / n_total * 100}')

# Print a few randomly chosen messages from each class for a quick eyeball
# check. The requested size is capped at the class size because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling is without replacement by default).
sample_size = 5
aggressive_sample = aggressive.sample(min(sample_size, len(aggressive)))["Message"].tolist()
non_aggressive_sample = non_aggressive.sample(min(sample_size, len(non_aggressive)))["Message"].tolist()

print('Aggressive Message Samples')
for message in aggressive_sample:
  print(f'\t{message}')

print('Non-Aggressive Message Samples')
for message in non_aggressive_sample:
  print(f'\t{message}')

def _mean_char_length(frame):
  """Return the mean character count of the values in frame["Message"]."""
  return frame["Message"].str.len().mean()


def _mean_word_count(frame):
  """Return the mean number of whitespace-separated tokens per message."""
  tokens = frame["Message"].str.split()
  return tokens.str.len().mean()


# Character-length averages: whole data set, then each class.
average_length = _mean_char_length(speech_df)
print(f'Average Message Length: {average_length}')

average_length_aggressive = _mean_char_length(aggressive)
average_length_non_aggressive = _mean_char_length(non_aggressive)
print(f'Average Aggressive Message Length: {average_length_aggressive}')
print(f'Average Non-Aggressive Message Length: {average_length_non_aggressive}')

# Word-count averages: whole data set, then each class.
average_word_count = _mean_word_count(speech_df)
print(f'Average Word Count: {average_word_count}')

average_word_count_aggressive = _mean_word_count(aggressive)
average_word_count_non_aggressive = _mean_word_count(non_aggressive)
print(f'Average Aggressive Word Count: {average_word_count_aggressive}')
print(f'Average Non-Aggressive Word Count: {average_word_count_non_aggressive}')

# Box plot comparing words per message between the two classes.
# Word counts come from whitespace splitting. A missing "Message" value
# yields NaN, and NaN in the input makes the box statistics NaN, so those
# rows are dropped before plotting.
aggressive_lengths = aggressive["Message"].str.split().str.len().dropna()
non_aggressive_lengths = non_aggressive["Message"].str.split().str.len().dropna()

plt.figure(figsize=(12, 12))
plt.boxplot(
  [aggressive_lengths, non_aggressive_lengths],
  tick_labels=["Aggressive", "Non-Aggressive"],
  widths=0.6,
)
# Log axis keeps the long tail readable.
# NOTE(review): a message with zero words cannot be drawn on a log scale;
# confirm whether such rows exist in the data.
plt.yscale("log")
plt.title("Message Length by Aggressiveness", fontsize=22)
# The plotted quantity is a word count, so the axis label names the unit.
plt.ylabel("Message Length (words)", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=18)
# Save before show(): the order is kept from the original cell.
plt.savefig(os.path.join(plot_dir, "message_length_boxplot.png"))
plt.show()

# Frequency of every whitespace-separated token across the aggressive
# messages, reusing the already-filtered `aggressive` frame rather than
# filtering speech_df a second time. Tokens are counted exactly as written
# (no lower-casing or punctuation stripping happens here). value_counts
# returns the most frequent tokens first, so head(30) is the top 30.
aggressive_word_counts = aggressive["Message"].str.split().explode().value_counts()
print('Top 30 Aggressive Words')
for word, count in aggressive_word_counts.head(30).items():
  print(f'\t{word}: {count}')

Aggressive Samples: 34613, Non-Aggressive Samples: 93825
Total Samples: 128438
Aggressive Percentage: 26.949189492206354
Non-Aggressive Percentage: 73.05081050779364
Aggressive Message Samples
	RT KingAreed Call me sexist or whatever but if ur my gfwomanwife keeping the houseexception our room laundry amp folding clothes is a M
	You Suck hers why  Your Spanish name your shitbrown attitude your outright stupiditythe list goes on and on Youre probably one of those silly shitbags who gets nailed to a cross on Easter as well Slurp shit and die
	Turkey is Islamist Let them pay the price for Islamofasicsm Europe and US are secular Dont want Islamolunatics
	Arun Jaitley Sir kindly have a proper implementation programme before making any plans Seems the govt is in a hurry to do everything at one point of time
	house you are pathetic
Non-Aggressive Message Samples
	They need at least a  to stay in the MKR competition Can Katie amp Nikki survive elimination httptcoZOPwHrDT
	Actually Jackson said