In [1]:
# Import libraries
import ast
import itertools
import math
import random
from collections import Counter
from statistics import mode

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# TF-IDF vectorizer with default settings, shared by the cells below
# (it is fitted on the training tweets and then reused to transform the test tweets)
vectorizer = TfidfVectorizer()

In [2]:
def _most_common_first(values):
    """Return the most frequent item of ``values``; ties go to the item seen first."""
    # Counter.most_common orders items with equal counts by first occurrence,
    # which makes the tie-break deterministic across runs.
    return Counter(values).most_common(1)[0][0]


def calculate_accuracy_user(model_name, df=None, ids=None):
    """Print and return the per-user accuracy of a model.

    Every user gets a single predicted location (the most frequent value in
    the "predictions" column of their rows) and a single true location (the
    most frequent value in their "location" column). A user counts as correct
    when both agree. Ties are resolved in favour of the value that appears
    first among the user's rows.

    Parameters
    ----------
    model_name : str
        Label used in the printed message.
    df : pandas.DataFrame, optional
        Frame with "id", "predictions" and "location" columns.
        Defaults to the notebook-level ``df_test``.
    ids : iterable, optional
        User ids to evaluate. Defaults to the notebook-level ``ids_test``.

    Returns
    -------
    float
        Fraction of ``ids`` predicted correctly; ``nan`` when ``ids`` is empty.
        An id without any row in ``df`` counts as an incorrect prediction.
    """
    if df is None:
        df = df_test
    if ids is None:
        ids = ids_test
    ids = list(ids)

    if not ids:
        # Nothing to evaluate: report nan instead of dividing by zero
        accuracy = float("nan")
        print("Accuracy for {}: {}".format(model_name, accuracy))
        return accuracy

    # Group the rows by user once instead of re-filtering the frame per user
    rows_by_user = {user_id: rows for user_id, rows in df.groupby("id", sort=False)}

    correct_pred = 0
    for id_num in ids:
        user_rows = rows_by_user.get(id_num)
        if user_rows is None:
            # No rows for this user, so there is no prediction to match
            continue
        pred = _most_common_first(user_rows["predictions"])
        real = _most_common_first(user_rows["location"])
        if real == pred:
            correct_pred += 1

    accuracy = correct_pred / len(ids)
    print("Accuracy for {}: {}".format(model_name, accuracy))
    return accuracy

In [3]:
# Load the pre-processed tweets.
# The first CSV column is an unnamed index (presumably written by an earlier to_csv call);
# read it as the index so it does not show up as a spurious "Unnamed: 0" data column
# that would leak into every row later on.
df_processed = pd.read_csv("tweet_dataset_processed_ext.csv", index_col=0)

In [4]:
# Preview the first rows of the processed dataset
df_processed.head()

Unnamed: 0,id,tweet,location
0,625707906,dia ayer amig cit emple call av irigoy 1334 av...,Andalusia
1,625707906,shaval shaval voy bedford ano vien t co aoxmpm...,Andalusia
2,625707906,hac falt pens sab llev segund fot pulser lumin...,Andalusia
3,625707906,imagin ser asi ridicul pens tod luc movil cans...,Andalusia
4,625707906,k recuerd d bibi limit t co ozagbzlngw,Andalusia


# Dummy prediction

In [5]:
# Shuffle all unique ids
# A fixed seed keeps the train/test split (and therefore the reported accuracy)
# reproducible across kernel restarts; change SPLIT_SEED to draw a different split.
SPLIT_SEED = 42

df_processed = df_processed.dropna()
ids = list(df_processed["id"].unique())
ids_shuffled = random.Random(SPLIT_SEED).sample(ids, len(ids))

In [6]:
# Split ids in train/test 0.9/0.1 approximately
# The split point is derived from the number of users instead of a hard-coded index,
# so the ratio holds (and the test set is not empty) if the dataset size changes.
# NOTE: a single random split gives a noisy estimate; the prediction should be repeated
# over several shuffles to get a more reliable accuracy.
TRAIN_FRACTION = 0.9

print(len(df_processed["id"].unique()))
n_train_ids = int(len(ids_shuffled) * TRAIN_FRACTION)
ids_train = ids_shuffled[:n_train_ids]
ids_test = ids_shuffled[n_train_ids:]

553


In [7]:
# Create train and test
df_train = df_processed[df_processed['id'].isin(ids_train)]
# df_test gets a "predictions" column added later, so take an explicit copy:
# writing into a slice of df_processed triggers pandas' SettingWithCopyWarning.
df_test = df_processed[df_processed['id'].isin(ids_test)].copy()

# Features are the tweet texts, targets are the locations
X_train = df_train["tweet"]
X_test = df_test["tweet"]
y_train = df_train["location"]
y_test = df_test["location"]

In [None]:
# Transform tweets with TFIDF vectorizer
# Read the raw text from df_train/df_test rather than from X_train/X_test: this cell
# overwrites X_train/X_test with the TF-IDF matrices, so feeding them back in would
# break the cell when it is run a second time.
X_train = vectorizer.fit_transform(df_train["tweet"])
X_test = vectorizer.transform(df_test["tweet"])

In [None]:
# Dummy prediction: Random Forest, Linear SVC and Logistic Regression
# We are predicting just individual tweets; calculate_accuracy_user then aggregates
# the tweet-level predictions into one prediction per user.

# Fixed seed for the estimators that use randomness, so the scores are repeatable
MODEL_RANDOM_STATE = 42

models = [
    ["Random Forest", RandomForestClassifier(random_state=MODEL_RANDOM_STATE)],
    ["Linear SVC", LinearSVC(random_state=MODEL_RANDOM_STATE)],
    ["Logistic Reg", LogisticRegression()],
]

for model_name, model in models:
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    # Rebind df_test to a new frame instead of writing a column into it in place:
    # df_test may be a slice of df_processed, where in-place writes raise SettingWithCopyWarning.
    df_test = df_test.assign(predictions=prediction)
    calculate_accuracy_user(model_name)
     

In [None]:
# For the dummy classification, using only the tweet text as input, we get barely 38% accuracy.
# To try to improve on this, we will add new features to the classification.
# The number of times a user mentions one of the cities in our list could be useful for the prediction.
# Therefore, we will assign to each user the number of times they mention a city of each CA (Autonomous Community).

# Generating new feature: nº times a city (in CA) is mentioned by user

In [23]:
# Autonomous Communities (CAs) in use, each stored as [name, "coordinates,radius"]
Andalusia = ["Andalusia", "37.3399964,-4.5811614,250km"]
Madrid = ["Madrid", "40.5248319,-3.7715628,60km"]
Catalonia = ["Catalonia", "41.8523094,1.5745043,150km"]
Canary_Islands = ["Canary_Islands", "28.5306525,-15.7464439,400km"]
Basque_Country = ["Basque_Country", "42.9911816,-2.5543023,100km"]

# Other CAs not used for now:
#   Extremadura, CastillaLaMancha, CastillaLeon, Cantabria, ComunidadValenciana,
#   Aragon, LaRioja, Navarra, Asturias, Murcia

# The order of this list fixes the position of every CA in the per-user count lists
CAS = [Andalusia, Madrid, Catalonia, Basque_Country, Canary_Islands]

# Human-readable names: underscores replaced by spaces
CAS_name = [ca_name.replace("_", " ") for ca_name, _ca_area in CAS]

In [10]:
# Zero-filled table: one row per unique user id, one column per CA name
df_CA_mentioned = pd.DataFrame(data=0, columns=CAS_name, index=ids)

In [11]:
# Raw (unprocessed) tweets
df_tweets = pd.read_csv("tweet_dataset_ext.csv")

# Cities reference table; later cells filter it by its "admin" column and read its "city" column
df_cities = pd.read_csv("cities.csv", encoding="utf-8")

In [41]:
# Count, for each user, how many of their tweets mention at least one city of each CA.
# Result: ids_CA_mention_count is a list of [id, counts], where counts follows the order of CAS_name.
# Only city names that appear as a single whitespace-separated token of the tweet are matched.

# City names of every CA, looked up once instead of once per tweet
cities_by_CA = [
    set(df_cities.loc[df_cities["admin"] == CA_name, "city"])
    for CA_name in CAS_name
]

# Tweets of every user, grouped once instead of re-filtering df_tweets per user
tweets_by_id = {id_n: list(tweets) for id_n, tweets in df_tweets.groupby("id")["tweet"]}

ids_CA_mention_count = []
for id_n in ids:
    CA_mention_count = [0] * len(CAS_name)
    for tweet in tweets_by_id.get(id_n, ()):
        if not isinstance(tweet, str):
            # Missing tweets are read as NaN (a float) and have no words to match
            continue
        tweet_words = set(tweet.split())
        for CA_pos, CA_cities in enumerate(cities_by_CA):
            # A tweet adds at most 1 to a CA, however many of its cities it mentions
            if not tweet_words.isdisjoint(CA_cities):
                CA_mention_count[CA_pos] += 1

    ids_CA_mention_count.append([id_n, CA_mention_count])
    

In [44]:
# Wrap the per-user counts in a two-column dataframe and persist it as CSV.
# NOTE(review): the "CA_mention_count" column holds Python lists, which CSV stores as text.
df_ids_CA_mention_count = pd.DataFrame(
    data=ids_CA_mention_count,
    columns=["id", "CA_mention_count"],
)
df_ids_CA_mention_count.to_csv(
    "ids_CA_mention_count.csv",
    index=False,
    encoding="utf-8-sig",
)

In [45]:
# Reload the saved per-user counts.
# CSV stores the count lists as text (e.g. "[1, 0, 0, 0, 0]"), so parse them back into
# real lists; the encoding matches the one used when the file was written.
df_ids_CA_mention_count = pd.read_csv(
    "ids_CA_mention_count.csv",
    encoding="utf-8-sig",
    converters={"CA_mention_count": ast.literal_eval},
)

In [None]:
# Append to every processed tweet row the CA mention counts of its author.
# Each output row is the row's own values followed by one count per CA.

# Index the [id, counts] entries by id once, instead of scanning the whole list for every row.
# setdefault keeps the first entry of an id, like the original first-match lookup.
entry_by_id = {}
for entry in ids_CA_mention_count:
    entry_by_id.setdefault(entry[0], entry)

id_position = df_processed.columns.get_loc("id")

list_with_CA_count = []
for row_values in df_processed.itertuples(index=False, name=None):
    # Kept as a one-element list of [id, counts]: a later inspection cell reads count_selected
    count_selected = [entry_by_id[row_values[id_position]]]
    list_with_CA_count.append(list(row_values) + count_selected[0][1])

In [49]:
# Transform to dataframe
# The rows come from list_with_CA_count: every row holds the values of one df_processed row
# followed by one mention count per CA (same order as CAS).
mention_columns = ["Andalusia_mention", "Madrid_mention", "Catalonia_mention", "Basque_Country_mention", "Canary_Islands_mention"]

# Name the leading columns after df_processed so the column count always matches the rows,
# then keep only the columns of interest (this drops any stray index column such as "Unnamed: 0").
df_processed_with_CA_count = pd.DataFrame(
    list_with_CA_count,
    columns=list(df_processed.columns) + mention_columns,
)[["id", "tweet", "location"] + mention_columns]

# Save dataframe as csv
df_processed_with_CA_count.to_csv("tweets_processed_with_CA_count.csv", encoding='utf-8-sig', index = False)

[[625707906, [1, 0, 0, 0, 0]]]


In [57]:
# Debugging aid: show the count list of the first entry in `count_selected`.
# NOTE(review): `count_selected` is a leftover loop variable from the cell that builds
# `list_with_CA_count`, so this only works after that loop has run — consider removing.
count_selected[0][1]

[1, 0, 0, 0, 0]