# Collection of Scripts for Exploratory Data Analysis
Author: Johanna Garthe
+ Tweet volume timeline
+ Dataset distribution
+ Max sequence length
+ Under-sampling the majority class at random
+ Check unique number of users per dataset
+ Check for text duplicates written by same user
+ Delete duplicates by text_ID

### Tweet volume timeline

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Folder holding one CSV file per dataset; every file must provide a
# 'created_at' column. (Named data_dir so the builtin dir() is not shadowed.)
data_dir = " "

# Activate the style before the figure exists: a matplotlib style only
# affects artists created after plt.style.use() has been called.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(20, 10))

# endswith() instead of a substring test skips names such as 'x.csv.bak';
# sorted() fixes the dataset order because os.listdir() order is arbitrary.
fileNames = sorted(name for name in os.listdir(data_dir) if name.endswith('.csv'))
for csv_name in fileNames:
    # os.path.join works whether or not data_dir ends with a path separator.
    df = pd.read_csv(os.path.join(data_dir, csv_name), parse_dates=['created_at'])
    # Optional filter (disabled): keep only rows whose text_type is 'tweet'.
    #df_tweets = df.loc[df['text_type'] == 'tweet']
    # Number of rows per calendar day.
    df_grouped = df.groupby(pd.Grouper(key='created_at', freq='1D', convention='start')).size()
    df_period = df_grouped.loc['2022-02-24':'2022-12-31']
    # One line per dataset, labelled with the file name minus its extension.
    df_period.plot(ax=ax, label=csv_name.rsplit('.', 1)[0])

plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
ax.set_xlabel('Date of Tweet Creation', fontsize=21)
ax.set_ylabel('1 Day Tweet Count\n', fontsize=21)
ax.set_title('Tweet Volume Timeline of Target US\n', fontsize=24)

# Fixed colours for the first two datasets; zip() stops early, so a folder
# with fewer than two CSV files no longer raises an IndexError.
for line, color in zip(ax.get_lines(), ("#fc4f30", "#008fd5")):
    line.set_color(color)

# Legend placed outside the axes, to the right of the plot.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc='center left', bbox_to_anchor=(1, 0.5), fontsize=18, title_fontsize=21, title='Datasets')
plt.show()

### Dataset distribution

In [None]:
import pandas as pd
from functools import reduce

# Path to the CSV to summarise; the file must provide 'target' and 'label' columns.
file_scraped_data = " "
df = pd.read_csv(file_scraped_data)

# Rows per target: overall, and restricted to each stance label.
favor = df.loc[df['label'] == 'FAVOR']
against = df.loc[df['label'] == 'AGAINST']
total_count = df['target'].value_counts()
favor_count = favor['target'].value_counts()
against_count = against['target'].value_counts()

# Turn each count series into a two-column frame (Target, <count name>).
df_allcounts, df_agcounts, df_favcounts = (
    pd.DataFrame({'Target': counts.index, column: counts.values})
    for column, counts in (
        ('Total', total_count),
        ('Against', against_count),
        ('Favor', favor_count),
    )
)
dfs = [df_allcounts, df_agcounts, df_favcounts]

# Outer-join the frames on 'Target' from left to right, so a target that is
# missing from one of the counts still keeps its row.
allcounts = dfs[0]
for right in dfs[1:]:
    allcounts = pd.merge(allcounts, right, on=['Target'], how='outer')

# Abbreviate the target names and order the rows by abbreviation.
replace_values = {
    'ukraine support': 'US',
    'npps operation continuation': 'NOC',
    'arms delivery': 'AD',
    'speed limit implementation': 'SLI',
}
allcounts = allcounts.replace({"Target": replace_values})
allcounts = allcounts.sort_values('Target')
allcounts

### Max sequence length

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from transformers import AutoTokenizer
import pandas as pd

# Path to a CSV file that provides a 'text_cleaned' column.
file = " "
data = pd.read_csv(file)
# Name or path of the pretrained checkpoint whose tokenizer is loaded.
model_ckpt = " "
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Encoded length of every text; encoding is truncated at max_length=512,
# so no entry can exceed 512.
token_lens = [
    len(tokenizer.encode(txt, truncation=True, max_length=512))
    for txt in data.text_cleaned
]

rcParams['figure.figsize'] = 12, 8
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# sns.distplot is deprecated and scheduled for removal; a density-normalised
# histplot with a KDE overlay is the replacement documented by seaborn.
sns.histplot(x=token_lens, kde=True, stat='density', kde_kws=dict(cut=3))
# Only the 0-256 token range is displayed.
plt.xlim([0, 256])
plt.xlabel('Token count')
# NOTE(review): a style selected here only affects figures created afterwards,
# not the plot above - confirm whether it should be set before plotting.
plt.style.use('fivethirtyeight')

#df = pd.DataFrame(token_lens)
#df.describe()

### Under-sampling the majority class at random

In [None]:
import pandas as pd
from sklearn.utils import resample

# Path to the CSV to down-sample; it must provide 'target' and 'label' columns.
file_name = " "
df = pd.read_csv(file_name)

# Seed forwarded to sklearn's resample(). None keeps the draw non-deterministic
# (the previous behaviour); set an int to make the down-sampling reproducible.
RANDOM_STATE = None


def _downsample_class(frame, target, label, n_samples, title, random_state=None):
    """Randomly down-sample the rows of one (target, label) class.

    Prints a banner containing *title*, the shape of the selected class and
    the shape of the down-sampled result.

    Args:
        frame: DataFrame with 'target' and 'label' columns.
        target: Value of the 'target' column that selects the class.
        label: Value of the 'label' column that selects the class.
        n_samples: Number of rows to keep. It is capped at the class size,
            because resample(replace=False) raises a ValueError when asked
            for more rows than are available.
        title: Human-readable class name used in the printed banner.
        random_state: Seed passed through to resample().

    Returns:
        Tuple ``(subset, downsampled)``: all rows of the class, and the
        randomly drawn rows that are kept.
    """
    print(f'********* Downsampling of Class {title} *********')
    subset = frame[(frame['target'] == target) & (frame['label'] == label)]
    print(subset.shape)
    n_keep = min(n_samples, len(subset))
    if n_keep == 0:
        # Nothing to draw from; return the (empty) selection unchanged.
        downsampled = subset
    else:
        downsampled = resample(subset,
                               replace=False,
                               n_samples=n_keep,
                               random_state=random_state)
    print(downsampled.shape)
    return subset, downsampled


# Downsampling of Class Arms Delivery - Favor
# NOTE: despite its name, 'arms_against' holds the FAVOR rows of 'arms delivery';
# the names are kept because 'arms_against_downsample' is used when the final
# dataset is assembled.
arms_against, arms_against_downsample = _downsample_class(
    df, 'arms delivery', 'FAVOR', 5500,
    'Arms Delivery - Favor', RANDOM_STATE)

# Downsampling of Class NPPs Operation Continuation - Against
npps_against, npps_against_downsample = _downsample_class(
    df, 'npps operation continuation', 'AGAINST', 5500,
    'NPPs Operation Continuation - Against', RANDOM_STATE)

# Downsampling of Class Speedlimit Implementation - Favor
speed_favor, speed_favor_downsample = _downsample_class(
    df, 'speed limit implementation', 'FAVOR', 5000,
    'Speedlimit Implementation - Favor', RANDOM_STATE)

# Downsampling of Class Ukraine Support - Favor
ua_favor, ua_favor_downsample = _downsample_class(
    df, 'ukraine support', 'FAVOR', 5500,
    'Ukraine Support - Favor', RANDOM_STATE)

In [None]:
# Remove the rows of the four down-sampled majority classes from the original
# dataframe, one class after the other, and keep the remainder in a new frame.
majority_classes = (
    ('arms delivery', 'FAVOR'),
    ('npps operation continuation', 'AGAINST'),
    ('speed limit implementation', 'FAVOR'),
    ('ukraine support', 'FAVOR'),
)
df_del4 = df
for maj_target, maj_label in majority_classes:
    is_majority = (df_del4['target'] == maj_target) & (df_del4['label'] == maj_label)
    df_del4 = df_del4.drop(df_del4[is_majority].index)

# Concatenate the down-sampled classes with the untouched remainder.
parts = [
    arms_against_downsample,
    npps_against_downsample,
    speed_favor_downsample,
    ua_favor_downsample,
    df_del4,
]
downsampled = pd.concat(parts)
print('Original dataset: ',df.shape)
print('Downsampled final dataset: ',downsampled.shape)

# Write the final down-sampled dataset to a new CSV file (header, no index column).
downsampled.to_csv(" ", index=False, header=True)

### Check unique number of users per dataset

In [None]:
import pandas as pd

# ---- LOAD FILE ---- #
# CSV to inspect; it must provide an 'author_id' column.
f_name = "../data/data_unlabeled/all_24feb31dec/3_predictions/stanceclasses/AD - Against.csv"
file = pd.read_csv(filepath_or_buffer=f_name)

# ---- SHOW NUMBER OF UNIQUE USERS ---- #
# Distinct author_id values; missing values are not counted.
author_ids = file.loc[:, "author_id"]
user_count = author_ids.nunique(dropna=True)
print('Unique numbers of users by author_id: ', user_count)

### Check for text duplicates written by same user

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np

# ---- LOAD FILE ---- #
# Path to the CSV that is checked for duplicated texts.
f_name = " "
file = pd.read_csv(filepath_or_buffer=f_name)
# Optional (disabled): cast the 'text_source' column to pandas' string dtype.
#file = file.astype({'text_source':'string'})

In [None]:
# ---- SHOW TEXT DUPLICATES AND THE CORRESPONDING TWEET IDS ---- #
texts = file["text"]
# keep=False flags every row whose text occurs more than once, including the
# first occurrence of each repeated text.
is_repeated = texts.duplicated(keep=False)
text_dupl = file[is_repeated].sort_values("text_id")
print('Total number of text duplicates: ',len(text_dupl))

In [None]:
# ---- SHOW NUMBER OF UNIQUE SAMPLES OF FOUND TEXT DUPLICATES ---- #
# How often each duplicated text occurs, most frequent first.
texts_counts = text_dupl["text"].value_counts() #text_source
print(
    'Unique elements in column "text" and their counts in descending order',
    "=" * 100,
    texts_counts,
    sep="\n",
)

In [None]:
# Optional: Save results
# reset_index() turns the counted texts (the index) into a regular column.
text_counts_df = texts_counts.reset_index()
text_counts_df.to_csv(" ", header=True, index=False)

In [None]:
# Number of distinct texts among the duplicated rows (missing values excluded).
duplicated_texts = text_dupl['text']
uniqueValues = duplicated_texts.nunique(dropna=True)
print('Unique elements in column "text" :',uniqueValues)

In [None]:
# ---- CHECK IF SAME USER WROTE TEXT DUPLICATES ---- #
# Number of duplicated-text rows per author_id, most frequent first.
author_counts = text_dupl["author_id"].value_counts()
print(
    'Unique elements in column "author_id" and their counts in descending order',
    "=" * 100,
    author_counts,
    sep="\n",
)

In [None]:
# Check out meta data
#file[file.author_id == 1234567].iloc[:1]

In [None]:
# Keep only the counts of at least 2 from author_counts (defined in an earlier cell).
author_counts_min2 = [c for c in author_counts if c >= 2]
author_counts_min = pd.Series(author_counts_min2)
# Computed once here instead of once per wedge inside the label callback.
total_count = sum(author_counts_min2)


def _wedge_label(pct):
    """Return 'percent  (absolute count)' for wedges above 3 %, else no label."""
    if pct > 3:
        return '{:.0f}%  ({:,.0f})'.format(pct, pct * total_count / 100)
    # Empty string rather than None, so small wedges get no text at all.
    return ''


mpl.rcParams['font.size'] = 17.5
cmap = plt.get_cmap("tab20")
# One colour per wedge, cycling through the palette: integer indices beyond
# the palette size would otherwise all be clipped to the same colour.
colors = cmap(np.arange(len(author_counts_min)) % cmap.N)
author_counts_min.plot.pie(legend=False, labels=None, colors=colors, autopct=_wedge_label, figsize=(30, 20))
#plt.style.use('fivethirtyeight')
plt.ylabel('')
plt.title('Distribution of Tweet Authors', fontsize=18)
plt.show()

### Delete duplicates by text_ID

In [None]:
import pandas as pd
# Path to the CSV from which duplicated text IDs are removed.
f_name = " "
scraped_data = pd.read_csv(filepath_or_buffer=f_name)

In [None]:
# Note: 'texts' holds the text_id column here, not the text itself.
texts = scraped_data["text_id"]
# keep=False flags every row whose text_id occurs more than once, including
# the first occurrence of each repeated ID.
shares_id = texts.duplicated(keep=False)
duplics = scraped_data[shares_id].sort_values("text_id")
print('Total number of text duplicates by text_ID: ',len(duplics))

In [None]:
# Delete duplicated text IDs and keep only the first occurrence of each
is_later_copy = scraped_data['text_id'].duplicated(keep='first')
nodupl = scraped_data[~is_later_copy]
nodupl.to_csv(" ", header=True, index=False)