### Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from urllib import request

In [2]:
# Download the DontPatronizeMe helper module next to the notebook so that it
# can be imported below.
# NOTE(review): this executes code fetched from a mutable branch ("master");
# consider pinning the URL to a specific commit.
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
# Copy the raw bytes unchanged: decoding and re-writing through a text-mode
# file would depend on the platform's default encoding and newline handling.
with request.urlopen(module_url) as response, open(module_name, 'wb') as outf:
    outf.write(response.read())

# The import has to stay after the download: the module file only exists once
# the code above has run.
from dont_patronize_me import DontPatronizeMe
dpm = DontPatronizeMe('./data', 'dontpatronizeme_pcl.tsv')

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [3]:
# Define the paths
# All paths are relative to the notebook's working directory.
# Tab-separated file holding the paragraphs.
DATA_PATH = "./data/dontpatronizeme_pcl.tsv"
# CSV listing the par_ids (and labels) that make up the official train split.
TRAIN_LABELS_PATH = "./data/train_semeval_parids-labels.csv"
# CSV listing the par_ids (and labels) that make up the official dev split.
DEV_LABELS_PATH = "./data/dev_semeval_parids-labels.csv"

In [4]:
# Load data
# The paragraphs file is tab-separated and has no header row; its first three
# lines are skipped.
pcl_data = pd.read_table(DATA_PATH, header=None, skiprows=3)
# The split files are plain comma-separated CSVs (pandas' default separator).
train_labels = pd.read_csv(TRAIN_LABELS_PATH)
dev_labels = pd.read_csv(DEV_LABELS_PATH)

In [5]:
# Some basic information about the data
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each
# report.
for title, frame in (
    ('PCL Data', pcl_data),
    ('Train Labels', train_labels),
    ('Dev Labels', dev_labels),
):
    print('#'*50)
    print(title)
    print('#'*50)
    frame.info()

##################################################
PCL Data
##################################################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10469 entries, 0 to 10468
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       10469 non-null  int64 
 1   1       10469 non-null  object
 2   2       10469 non-null  object
 3   3       10469 non-null  object
 4   4       10468 non-null  object
 5   5       10469 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 490.9+ KB
None
##################################################
Train Labels
##################################################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8375 entries, 0 to 8374
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   par_id  8375 non-null   int64 
 1   label   8375 non-null   object
dtypes: int64(1), object(1)
memory usage: 131.0+ KB
None
##############

In [6]:
# Get the data in the form used by task 1
dpm.load_task1()
pcl_data = dpm.train_task1_df
# There is deliberately no `pcl_data['text'] = pcl_data['text'].dropna()` step
# here: assigning the result back to the column re-aligns it on the index, so
# the column would come out unchanged and no row would be removed.
print('PCL Data')
print('#'*50)
# info() prints its own report and returns None, so it is not wrapped in print().
pcl_data.info()
print('#'*50)
print(pcl_data.head())

PCL Data
##################################################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10469 entries, 0 to 10468
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   par_id      10469 non-null  object
 1   art_id      10469 non-null  object
 2   keyword     10469 non-null  object
 3   country     10469 non-null  object
 4   text        10469 non-null  object
 5   label       10469 non-null  int64 
 6   orig_label  10469 non-null  object
dtypes: int64(1), object(6)
memory usage: 572.7+ KB
None
##################################################
  par_id      art_id    keyword country  \
0      1  @@24942188   hopeless      ph   
1      2  @@21968160    migrant      gh   
2      3  @@16584954  immigrant      ie   
3      4   @@7811231   disabled      nz   
4      5   @@1494111    refugee      ca   

                                                text  label orig_label  
0  We 're living in times of abs

In [7]:
# Get the train and dev data
# Extract the par_ids from the data
def extract_data(data, labels):
    """Collect the paragraph fields for every ``par_id`` listed in ``labels``.

    Args:
        data: DataFrame with at least ``par_id``, ``keyword``, ``text`` and
            ``label`` columns. If a ``par_id`` occurs more than once, its
            first row is used.
        labels: DataFrame with a ``par_id`` column. Its row order (not its
            index labels) determines the row order of the result.

    Returns:
        A new DataFrame with the columns ``par_id``, ``keyword``, ``text`` and
        ``label`` (all values taken from ``data``), one row per row of
        ``labels``, indexed 0..n-1.

    Raises:
        IndexError: if a ``par_id`` from ``labels`` does not occur in ``data``.
    """
    # Build one indexed lookup table instead of scanning all of `data` three
    # times for every label row.
    lookup = (
        data.drop_duplicates(subset='par_id', keep='first')
        .set_index('par_id')[['keyword', 'text', 'label']]
    )

    # Positional access, so `labels` does not need a default 0..n-1 index.
    wanted = labels['par_id'].to_numpy()

    known = pd.Index(wanted).isin(lookup.index)
    if not known.all():
        missing = pd.unique(wanted[~known]).tolist()
        raise IndexError(f"par_id(s) not found in data: {missing[:10]}")

    # .loc with a list of labels returns the rows in the requested order;
    # reset_index() turns the par_id index back into the first column.
    return lookup.loc[wanted].reset_index()

# The dev set is kept aside for final testing, so the validation set is carved
# out of the train data instead.
# par_id is cast to str so that it can be compared with the par_id values in pcl_data.
train_labels['par_id'] = train_labels['par_id'].astype(str)
dev_labels['par_id'] = dev_labels['par_id'].astype(str)

train_data = extract_data(pcl_data, train_labels)
test_data = extract_data(pcl_data, dev_labels)

# Hold out a validation set half the size of the dev set, stratified by label
val_size = len(test_data) // 2
train_data, val_data = train_test_split(
    train_data,
    test_size=val_size,
    random_state=42,
    stratify=train_data['label'],
)
print(f"Train data: {len(train_data)}")
print(f"Val data: {len(val_data)}")
print(f"Dev data: {len(test_data)}")

# Sanity check: the three splits together must account for every paragraph
split_total = len(train_data) + len(val_data) + len(test_data)
print("Data split successfully!" if split_total == len(pcl_data) else "Data split failed!")

Train data: 7328
Val data: 1047
Dev data: 2094
Data split successfully!


In [8]:
# Clear out the rows whose text is missing or empty
def clear_none(data):
    """Drop, in place, every row of ``data`` whose ``text`` is missing or empty.

    Args:
        data: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        None. Rows whose ``text`` is ``None``/``NaN`` or the empty string are
        removed from ``data``; the index labels of the remaining rows are kept.
    """
    unusable = data['text'].isna() | (data['text'] == "")
    # One vectorised drop instead of dropping rows while iterrows() is still
    # walking the same frame.
    data.drop(index=data.index[unusable], inplace=True)

# Clean every split in place, then report how many rows are left in each
for split_frame in (train_data, val_data, test_data):
    clear_none(split_frame)

print(f"Train data: {len(train_data)}")
print(f"Val data: {len(val_data)}")
print(f"Test data: {len(test_data)}")

Train data: 7328
Val data: 1047
Test data: 2093


In [9]:
# Write each split to its own CSV file (without the index column)
for split_frame, csv_path in (
    (train_data, "./data/train_data.csv"),
    (val_data, "./data/val_data.csv"),
    (test_data, "./data/test_data.csv"),
):
    split_frame.to_csv(csv_path, index=False)

print("Data saved successfully!")

# Preview the first rows of every split, each preceded by a horizontal rule
rule = "-"*50
print(rule)
print(f"Train data:\n {train_data.head()}")
print(rule)
print(f"Val data:\n {val_data.head()}")
print(rule)
print(f"Test data:\n {test_data.head()}")

Data saved successfully!
--------------------------------------------------
Train data:
      par_id   keyword                                               text  \
4932   4577  disabled  Miller , out since straining his left hamstrin...   
4987   4636   in-need  The prologue talks about four friends , who ar...   
4759   4391  disabled  He said the home for disabled children relied ...   
7054   6924  hopeless  Richard was referring to the Battle of Greece ...   
4041   3597     women  It is thought that he may have been involved o...   

      label  
4932      0  
4987      0  
4759      0  
7054      0  
4041      0  
--------------------------------------------------
Val data:
      par_id  keyword                                               text  label
6445   6251    women  Far Eastern University rallied from a set down...      0
1286    548  refugee  """ At the moment we do not have plans to rece...      0
1722   1041    women  For example , a business selling a product aim...