In [1]:
import pandas as pd
import re
import nltk

In [2]:
raw_data = pd.read_csv('./data/raw_data.csv')

raw_data

Unnamed: 0,German,French,Italian,Spanish,English
0,"@Darth_Lehrer Mist, durchschaut! \nEgal, ich m...","Au Sénat, Bernard Arnault se dépeint en bienfa...","Pregare insieme, gli uni per gli altri, e darc...","""Pandemia silenciosa"": las infecciones por bac...",Psychologist Robert Coplan studies the concept...
1,@florianklenk Oh was für eine süße Watschelente!,#Présidentielle2022 Le récap politique du jour...,#PreghiamoInsieme per le popolazioni delle Iso...,"""Creo que hay un sesgo masculino de despreciar...",“Things like love and trust and caring simply ...
2,@krizzy4peace 🚇👉🐑,L’ex-otage Ingrid Betancourt précandidate à la...,Oggi #PreghiamoInsieme per coloro che sono in ...,La fiscalía peruana anunció que comenzará una ...,It’s not just temperature. All kinds of factor...
3,@tmigge So viel Stups ist auch wieder doof. 🤷‍♂️,"En exil en France, Kamal Mouzawak partage les ...",La tenerezza non è una questione emotiva o sen...,Hay temores de que Moscú está tratando de divi...,“On days that were warmer than average people ...
4,@tmigge Stupst!,L’Egypte capture un opposant islamiste après l...,"Come i Magi, venuti dall’oriente a Betlemme pe...","Putin amenazó con ""apropiadas medidas técnico-...",“Although people are quite aware of global war...
...,...,...,...,...,...
1995,,,,Un grupo de investigadores confirma el motivo ...,It’s not lazy. It keeps milk to cereal ratios ...
1996,,,,¿Sabías que los suelos son el sumidero de #car...,Don't ever say I waste peanut butter. https://...
1997,,,,Un reciente #análisisdedatos proporcionados po...,On today's 'Hasan...From a Distance' we're try...
1998,,,,¿Cuánto sabes sobre #flores? ¿Serás capaz de a...,Got my home office set up right. https://t.co/...


In [3]:
raw_data.isna().sum()

German     204
French       5
Italian     29
Spanish      0
English      1
dtype: int64

## Some ideas:
- `@` + word + ` ` 
- Lowercase everything

### Remove
- Numbers
- Hashtags
- Links
- Special characters such as *:, (, ), etc.*
- Emojis
- `"RT"`

### Special characters
- \ - " "( )  / : _ * $ + 

In [4]:
languages = raw_data.columns
languages

Index(['German', 'French', 'Italian', 'Spanish', 'English'], dtype='object')

In [5]:
def clean_data(text):
    # Hashtag filter
    no_hashtag = text.apply(lambda x: re.sub('#[\w]+','', x)) 

    # Mention filter
    no_mention = no_hashtag.apply(lambda x: re.sub("@([A-Za-z0-9_ßäöüÄÖÜÇâêîôûéàèùëïç]+)|([^0-9A-Za-z_ßäöüÄÖÜâêîôûéàèùëïç \t])|(\w+:\/\/\S+)",' ', x))

    # Remove RT
    no_rt = no_mention.apply(lambda x: re.sub('RT','', x))

    # Remove numbers
    no_num = no_rt.apply(lambda x: re.sub('[0-9]+','', x))

    # Remove extra underscores
    no_underscore = no_num.apply(lambda x: re.sub('_','', x))

    # Lowercase the words
    lowercase = no_underscore.apply(lambda x: x.lower())

    # Remove traces of ampersand
    no_ampersand = lowercase.apply(lambda x: re.sub(' amp ','', x))

    # Remove leading and trailing whitespaces
    no_space = no_ampersand.apply(lambda x: x.strip())

    # Remove extra spaces in between words
    cleaned = no_space.apply(lambda x: " ".join(x.split()))

    return cleaned

# TODO 
### Condense all the lambda functions into a single one and pass a series as the argument

In [6]:
no_punctuation = pd.DataFrame()

for language in languages:
    sentences = raw_data[language].dropna(axis=0)
    no_punctuation[language] = clean_data(sentences)

In [7]:
def read_stopwords(language):
	nltk.download("stopwords")
	return nltk.corpus.stopwords.words(language.lower())

def remove_stopwords(text, stopwords):
	text = text.split()
	stopwords_removed = [word for word in text if word not in stopwords]
	return stopwords_removed

In [8]:
cleaned_data = pd.DataFrame()

for language in languages:
    # Stopwords
    stopwords = read_stopwords(language)
    
    # Language sentences
    section = no_punctuation[language]
    section_dropped = section.dropna().copy()

    # Filter out stopwords
    no_stopwords = section_dropped.apply(lambda x: remove_stopwords(x, stopwords))

    # "Un-tokenize" the text
    cleaned_text = no_stopwords.apply(lambda x: " ".join([word for word in x]))

    # Save the data as a new column
    cleaned_data[language] = pd.concat([cleaned_data, cleaned_text], axis=1)[language]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Johng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Johng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Johng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Johng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Johng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
cleaned_data.head()

Unnamed: 0,German,French,Italian,Spanish,English
0,mist durchschaut egal mach einfach,sénat bernard arnault dépeint bienfaiteur presse,pregare insieme uni altri darci fare insieme c...,pandemia silenciosa infecciones bacterias resi...,psychologist robert coplan studies concept alo...
1,oh süße watschelente,récap politique jour,popolazioni isole tonga colpite giorni scorsi ...,creo sesgo masculino despreciar quejas femenin...,things like love trust caring simply work unde...
2,,ex otage ingrid betancourt précandidate présid...,oggi coloro carcere tenerezza dio raggiunga ca...,fiscal peruana anunci comenzar investigaci n p...,temperature kinds factors shape views decision...
3,stups doof,exil france kamal mouzawak partage saveurs liban,tenerezza questione emotiva sentimentale esper...,temores mosc est tratando dividir desestabiliz...,days warmer average people would give money co...
4,stupst,egypte capture opposant islamiste après atterr...,magi venuti oriente betlemme onorare re messia...,putin amenaz apropiadas medidas técnico milita...,although people quite aware global warming bel...


Save the resulting data to a csv file

In [10]:
cleaned_data.to_csv('./data/cleaned_data.csv', index=False)