In [1]:
import pandas as pd
import re
import nltk

In [2]:
raw_data = pd.read_csv('./data/raw_data.csv')

raw_data

Unnamed: 0,German,French,Italian,Spanish,English
0,"@MeehrChaos @neSeemannsbraut Naja, ich war ger...",Un début d’accalmie sur le front des incendies...,Siamo al mondo per vivere una storia d’amore c...,"Cuando el escritor estaba en el escenario, el ...","We’re told not to dwell on the past, but is no..."
1,"@FrauZimt „Ich finde, in der Stadt hält man da...",Etats-Unis : le plan climat et santé de Joe Bi...,"Chi si crede ricco, vincente e sicuro, fonda t...",El caso de Lee reafirma la concepción popular ...,"Despite our intuition, Wilson says we're not a..."
2,"„Ach, diese Hitzewellen, dieser Klimawandel, s...","Marchés d’Europe, Stevenson, luttes féministes...",La vecchiaia è la fase della vita più adatta a...,La orden de registro al hogar del expresidente...,The results? The women who saw the profile wer...
3,"@regenmuseum Harte, äh, trockene Zeiten… 😢","Comment Ginko Financial, la plus célèbre banqu...",Quanto è prezioso quel senso di familiarità e ...,"La novela ""Los versos satánicos"", de Salman Ru...",How well do you know yourself?\n\nIn a speed d...
4,@rainerklute 💡,L’actrice américaine Anne Heche déclarée morte...,La rinascita di un dialogo passa non dalle par...,"En Corea del Sur, los conglomerados gigantes d...","""I feel like she's a guardian angel of my subc..."
...,...,...,...,...,...
1995,#Vertipper des Tages: Sorgiment. \n\n😳,RT @Sante_Gouv: #Canicule | Comment se rafraîc...,"""La prima cosa che mi ha colpito è stata l'ema...",El Sol esconde enormes asteroides que no vemos...,RT @patriotact: Brand new episode of Patriot A...
1996,Thread! 👇 https://t.co/YsLyXezyTq,RT @Sante_Gouv: #Monkeypox | Quels sont les sy...,"""Un immediato colpo di fulmine, un prodigio, i...",Lombrices y camarones fueron los primeros en r...,RT @patriotact: Hasan has a pitch for how to r...
1997,"RT @SlenderSherbet: When assembling your dog, ...",RT @francediplo: #OTAN | Le Parlement 🇫🇷 a rat...,"""Un romanzo con una voce, che esprime subito u...",Descubren uno de mayores diamantes rosas de to...,America needs a commissioner: Adam Silver. htt...
1998,"RT @Sophia165540956: Das ist Olga,12Jahre alt....",RT @Sante_Gouv: #Vaccination | 2ème dose de ra...,,Los mejores secadores de pelo para llevarte en...,


In [3]:
# Check out any null values
raw_data.isna().sum()

German     1
French     0
Italian    2
Spanish    0
English    2
dtype: int64

## Some Ideas:
- Lowercase everything

### Remove
- `@` + word + ` ` 
- Numbers
- Hashtags
- Links
- Special characters such as *:, (, ), etc.*
- Emojis
- `"RT"`

### Special characters
- \ - " "( )  / : _ * $ + 

In [4]:
languages = raw_data.columns
languages

Index(['German', 'French', 'Italian', 'Spanish', 'English'], dtype='object')

In [5]:
def clean_data(text):
    # Hashtag filter
    no_hashtag = text.apply(lambda x: re.sub('#[\w]+','', x)) 

    # Mention filter
    no_mention = no_hashtag.apply(lambda x: re.sub("@([A-Za-z0-9_ßäöüÄÖÜÇâêîôûéàèùëïç]+)|([^0-9A-Za-z_ßäöüÄÖÜâêîôûéàèùëïç \t])|(\w+:\/\/\S+)",' ', x))

    # Remove RT
    no_rt = no_mention.apply(lambda x: re.sub('RT','', x))

    # Remove numbers
    no_num = no_rt.apply(lambda x: re.sub('[0-9]+','', x))

    # Remove extra underscores
    no_underscore = no_num.apply(lambda x: re.sub('_','', x))

    # Lowercase the words
    lowercase = no_underscore.apply(lambda x: x.lower())

    # Remove traces of ampersand
    no_ampersand = lowercase.apply(lambda x: re.sub(' amp ','', x))

    # Remove leading and trailing whitespaces
    no_space = no_ampersand.apply(lambda x: x.strip())

    # Remove extra spaces in between words
    cleaned = no_space.apply(lambda x: " ".join(x.split()))

    return cleaned

# TODO 
### Condense all the lambda functions into a single one and pass a series as the argument

In [6]:
no_punctuation = pd.DataFrame()

for language in languages:
    sentences = raw_data[language].dropna(axis=0)
    no_punctuation[language] = clean_data(sentences)

In [7]:
def read_stopwords(language):
    nltk.download("stopwords")
    return nltk.corpus.stopwords.words(language.lower())

def remove_stopwords(text, stopwords):
    text = text.split()
    stopwords_removed = [word for word in text if word not in stopwords]
    return stopwords_removed

In [8]:
cleaned_data = pd.DataFrame()

for language in languages:
    # Stopwords
    stopwords = read_stopwords(language)
    
    # Language sentences
    section = no_punctuation[language]
    section_dropped = section.dropna().copy()

    # Filter out stopwords
    no_stopwords = section_dropped.apply(lambda x: remove_stopwords(x, stopwords))

    # "Un-tokenize" the text
    cleaned_text = no_stopwords.apply(lambda x: " ".join([word for word in x]))

    # Save the data as a new column
    cleaned_data[language] = pd.concat([cleaned_data, cleaned_text], axis=1)[language]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Johng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Johng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Johng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Johng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Johng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
cleaned_data.head()

Unnamed: 0,German,French,Italian,Spanish,English
0,naja gerade frankreich tatsächlich schrecklich...,début accalmie front incendies france,mondo vivere storia d amore dio abbracciare au...,escritor escenario agresor subi tarima asest r...,told dwell past nostalgia exception weekly new...
1,finde stadt hält gar fahre abends immer kurz w...,etats unis plan climat santé joe biden adopté ...,crede ricco vincente sicuro fonda sé chiude di...,caso lee reafirma concepci n popular l deres e...,despite intuition wilson says actually good pr...
2,ach hitzewellen klimawandel schlimm gleich mal...,marchés europe stevenson luttes féministes rep...,vecchiaia fase vita adatta diffondere lieta no...,orden registro hogar expresidente expone trump...,results women saw profile poor predicting eith...
3,harte äh trockene zeiten,comment ginko financial plus célèbre banque se...,prezioso quel senso familiarità comunità tanto...,novela versos sat nicos salman rushdie conside...,well know speed dating study women rated much ...
4,,actrice américaine anne heche déclarée morte s...,rinascita dialogo passa parole silenzio impunt...,corea sur conglomerados gigantes dominan econo...,feel like guardian angel subconscious remind c...


Save the resulting data to a csv file

In [10]:
cleaned_data.to_csv('./data/cleaned_data.csv', index=False)