# data cleaning


- Data source: the "Twitter Entity Sentiment Analysis" dataset on Kaggle — https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis

In [1]:
import re
import pandas as pd

In [2]:
# Sentiment label -> integer code written to the `targets` column.
LABELS_MAPPING = {
  'positive': 1,
  'neutral': 2,
  'negative': 3,
  'irrelevant': 4,
}
# Inverse lookup (integer code -> label), derived from the table above so the
# two mappings cannot drift apart.
LABELS_MAPPING_REV = {code: name for name, code in LABELS_MAPPING.items()}

def normalise(x: str) -> str:
  """Return *x* as lowercase text with whitespace runs collapsed.

  The value is coerced with ``str()`` first, so non-string input (numbers,
  ``None``, NaN) comes back as its lowercase string form rather than raising.
  Leading and trailing whitespace is removed and every internal run of
  whitespace becomes a single space.
  """
  raw = str(x)
  # split() with no argument splits on any whitespace run and drops the ends.
  single_spaced = ' '.join(raw.split())
  return single_spaced.lower()

def clean(x: str) -> str:
  """Strip tweet noise (links, mentions, punctuation) from a piece of text.

  The input is coerced with ``str()``. Steps run in this order:

  1. Remove ``http://`` / ``https://`` links, ``pic.twitter...`` links,
     ``twitch.tv...`` links and any whitespace-delimited token that
     contains ``.com``.
  2. Turn ``&`` (or its HTML-escaped form ``&amp;``) into the separate
     word ``and``.
  3. Remove ``@mention`` handles.
  4. Replace every character that is not an ASCII letter, digit,
     whitespace, ``@`` or ``-`` with a space.
  5. Collapse whitespace runs and trim the ends.

  All patterns are matched case-sensitively.

  Returns:
    The cleaned text; an empty string when nothing survives.
  """
  text = str(x)
  # Links are removed first, while they are still one whitespace-free token:
  # expanding "&" to " and " beforehand would split URLs that carry query
  # parameters and leave their tail behind.
  text = re.sub(r"https?://\S+", " ", text)
  # Only real pic.twitter links; a bare "pic." at the end of a sentence is an
  # ordinary word and must be kept.
  text = re.sub(r"\bpic\.twitter\S*", " ", text)
  text = re.sub(r"\btwitch\.tv\S*", " ", text)
  # Drop the whole token around ".com" (sub-domain and path included) so no
  # "www" / path fragments are left over. The \b keeps words such as
  # "awesome.come" (missing space after a full stop) from being treated as a
  # domain.
  text = re.sub(r"\S*\.com\b\S*", " ", text)
  # Pad the replacement so neighbouring words are not glued together
  # ("rock&roll" -> "rock and roll"); "&amp;" is the HTML-escaped ampersand.
  text = re.sub(r"&(?:amp;)?", " and ", text)
  # remove usernames from tweets, starting with @
  text = re.sub(r"@\w*", " ", text)
  # keep only letters, digits, whitespace, '@' and '-'
  text = re.sub(r"[^a-zA-Z0-9\s@-]", " ", text)
  return ' '.join(text.split())

def preprocess(data: pd.DataFrame) -> pd.DataFrame:
  """Clean a raw tweet frame and attach integer sentiment targets.

  Args:
    data: frame with at least a ``label`` and a ``text`` column. It is left
      untouched; all work is done on a copy.

  Returns:
    A new frame with the columns ``label`` (normalised), ``text``
    (normalised, then cleaned) and ``targets`` (integer code looked up in
    ``LABELS_MAPPING``). A row is dropped when its text is missing, when it
    duplicates another ``(label, text)`` pair after cleaning, when its text
    has no ASCII letter or fewer than two words, or when its label is not a
    key of ``LABELS_MAPPING``. Surviving rows keep their original index
    labels.
  """
  # Missing text has to be removed BEFORE normalise() runs: normalise()
  # coerces with str(), which would turn NaN into the literal text "nan".
  # The explicit copy keeps the caller's frame unmodified and avoids
  # SettingWithCopyWarning on the column assignments below.
  frame = data[['label', 'text']].dropna(subset=['text']).copy()

  # normalise, then clean, the tweet text
  frame['text'] = frame['text'].apply(normalise).apply(clean)
  # normalise labels before de-duplicating so case variants of the same
  # label count as duplicates
  frame['label'] = frame['label'].apply(normalise)
  frame = frame.drop_duplicates()

  # keep only rows whose text contains at least one letter
  has_letters = frame['text'].apply(
    lambda text: re.search(r"[a-zA-Z]", text) is not None
  ).astype(bool)
  # keep only rows with 2 or more words
  enough_words = frame['text'].apply(
    lambda text: len(text.split()) > 1
  ).astype(bool)
  # map labels to numbers; labels missing from the mapping come back as NaN
  targets = frame['label'].map(LABELS_MAPPING)
  keep = has_letters & enough_words & targets.notna()

  frame = frame[keep].copy()
  # With the NaN rows gone the codes can be stored as integers.
  frame['targets'] = targets[keep].astype(int)
  return frame[['label', 'text', 'targets']]

## train data

In [3]:
# Load the raw training tweets. The CSV is read without a header row
# (header=None), so the column names are supplied here.
train = pd.read_csv(
  "./data/twitter_training.csv",
  names=['number', 'unknown', 'label', 'text'],
  header=None,
)
train.head()

Unnamed: 0,number,unknown,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
# Run the cleaning pipeline (preprocess) on the raw training frame.
train_clean = preprocess(train)


In [5]:
# Preview the first rows of the cleaned training data.
train_clean.head()

Unnamed: 0,label,text,targets
0,positive,im getting on borderlands and i will murder yo...,1
1,positive,i am coming to the borders and i will kill you...,1
2,positive,im getting on borderlands and i will kill you all,1
3,positive,im coming on borderlands and i will murder you...,1
4,positive,im getting on borderlands 2 and i will murder ...,1


In [6]:
# x = list(train_clean['text'].values)

In [7]:
# Write the cleaned training data to disk; index=False leaves the DataFrame
# index out of the file.
train_clean.to_csv("./data/train.csv", index=False )

## validation data

In [8]:
# Load the raw validation tweets. The CSV is read without a header row
# (header=None). The first two columns only get placeholder names here;
# preprocess() selects just 'label' and 'text'.
val = pd.read_csv(
  "./data/twitter_validation.csv",
  names=['', 'blah', 'label', 'text'],
  header=None,
)
val.head()

Unnamed: 0,Unnamed: 1,blah,label,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [None]:
# Run the same cleaning pipeline (preprocess) on the raw validation frame.
val_clean = preprocess(val)


In [10]:
# Preview the first rows of the cleaned validation data.
val_clean.head()

Unnamed: 0,label,text,targets
0,irrelevant,i mentioned on facebook that i was struggling ...,4
1,neutral,bbc news - amazon boss jeff bezos rejects clai...,2
2,negative,why do i pay for word when it functions so poo...,3
3,negative,csgo matchmaking is so full of closet hacking ...,3
4,neutral,now the president is slapping americans in the...,2


In [11]:
# Write the cleaned validation data to disk; index=False leaves the DataFrame
# index out of the file.
val_clean.to_csv("./data/val.csv", index=False)