In [49]:
# Import libraries
import pandas as pd
import unidecode
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import random
import math
import statistics 
from statistics import mode 

# NLP
stemmer = SnowballStemmer('spanish')
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
def process_tweet(sentence):
    """ Process sentece by deleting accents, lowercasing, stemming, 
    deleting stopwords and cleaning not useful text"""
    sentence = sentence.lower()
    sentence = unidecode.unidecode(sentence)
    words = sentence.split()
    if len(words) > 1:
        if words[0].lower() == "rt" and "@" in words[1]:
            del words[1]
    sentence = " ".join(words)
    words = tokenizer.tokenize(sentence)
    words = [w for w in words if not w in stopwords.words("spanish")]
    words = [w for w in words if ("rt" != w.lower()) if (not "http" in w)]     # Delete words that contain mentions or the RT word 
    words = [stemmer.stem(w) for w in words] # Lemmatization spanish
    sentence = " ".join(words)
    return sentence

In [None]:
# Import dataset
df_tweets= pd.read_csv("tweet_dataset.csv")

In [None]:
# Load CAs
Andalusia = ["Andalusia", "37.3399964,-4.5811614,250km"]
Madrid = ["Madrid", "40.5248319,-3.7715628,60km"]
Catalonia = ["Catalonia", "41.8523094,1.5745043,150km"]
Canary_Islands = ["Canary_Islands", "28.5306525,-15.7464439,400km"]
Basque_Country = ["Basque_Country", "42.9911816,-2.5543023,100km"]
#Extremadura = 
#CastillaLaMancha = 
#CastillaLeon = 
#Cantabria = 
#ComunidadValenciana = 
#Aragon = 
#LaRioja = 
#Navarra = 
#Asturias = 
#Murcia = 

CAS = [Andalusia, Madrid, Catalonia, Basque_Country, Canary_Islands]

In [None]:
# Process tweets from dataset
processed_dataset = []
n_rows = len(df_tweets["location"])
actual_row = 1
for CA in CAS:
    for index, row in df_tweets[df_tweets["location"] == CA[0]].iterrows(): 
        sentence = process_tweet(row[1])
        processed_dataset.append([row[0], sentence, row[2]])   
        if actual_row%100 == 0:
            print("Row {} of {}".format(actual_row, n_rows))
        actual_row += 1
        

In [None]:
# Transform to dataframe
df_processed = pd.DataFrame(processed_dataset, columns = ["id", "tweet", "location"])

In [None]:
# Save dataframe as csv
df_processed.to_csv("tweet_dataset_processed.csv", encoding='utf-8-sig', index = False)

In [2]:
df_processed = pd.read_csv("tweet_dataset_processed.csv")

In [3]:
df_processed.head()

Unnamed: 0,id,tweet,location
0,3187391489,9 manan cojon dia,Andalusia
1,3187391489,unic mied futur pas amistad,Andalusia
2,3187391489,i can t stop watching this specific tiktok t c...,Andalusia
3,3187391489,oye dibuj bien quier hac plan cn algui q guach...,Andalusia
4,3187391489,punt apart,Andalusia


# Dummy prediction

In [67]:
# Shuffle all unique ids
df_processed = df_processed.dropna()
ids = list(df_processed["id"].unique())
ids_shuffled = random.sample(ids, len(ids))

In [68]:
# Split ids in train/test 0.9/0.1 approximately
print(len(df_processed["id"].unique()))
ids_train = ids_shuffled[:228]
ids_test = ids_shuffled[228:]

254


In [69]:
# Create train and test
df_train = df_processed[df_processed['id'].isin(ids_train)]
df_test = df_processed[df_processed['id'].isin(ids_test)]

X_train = df_train["tweet"]
X_test = df_test["tweet"]
y_train = df_train["location"]
y_test = df_test["location"]

In [71]:
# Transform tweets with TFIDF vectorizer
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [73]:
# Dummy prediction: Checking accuracy and confusion matrix with Random Forest Classifier
# We are predicting just individual tweets
model = RandomForestClassifier()
model.fit(X_train,y_train)
prediction = model.predict(X_test)
#accuracy = metrics.accuracy_score(prediction,y_test)
#print("Accuracy: ", accuracy)
#print("Confusion matrix: ", confusion_matrix(y_test, prediction))



Accuracy:  0.3126923076923077
Confusion matrix:  [[388 260 113  30 309]
 [106 112  54  20 108]
 [151 112  92  24 121]
 [  0   0   0   0   0]
 [173 143  47  16 221]]


In [99]:
correct_pred = 0
for id_num in ids_test:
    pred = mode(df_test[df_test["id"] == id_num]["predictions"])
    real = mode(df_test[df_test["id"] == id_num]["location"])
    if real == pred:
        correct_pred = correct_pred + 1

print("Correct prediction rate {}".format(correct_pred/len(ids_test)))

Correct prediction rate 0.5384615384615384
