In [1]:
# Import libraries
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from collections import Counter
import random
import math
from statistics import mode 
import itertools
import numpy as np

# TFIDF
vectorizer = TfidfVectorizer()

In [2]:
def calculate_accuracy_user(model_name):
    correct_pred = 0
    for id_num in ids_test:
        c = Counter(df_test[df_test["id"] == id_num]["predictions"])  
        mode_count = max(c.values())
        mode = {key for key, count in c.items() if count == mode_count}
        pred = next(x for x in df_test[df_test["id"] == id_num]["predictions"] if x in mode)
        #real = mode(df_test[df_test["id"] == id_num]["location"])
        lst  =(list(df_test[df_test["id"] == id_num]["location"]))
        real = max(set(lst), key=lst.count)
        if real == pred:
            correct_pred = correct_pred + 1

    print("Accuracy for {}: {}".format(model_name, correct_pred/len(ids_test)))

In [3]:
df_processed = pd.read_csv("tweet_dataset_processed_ext.csv")

In [9]:
len(df_processed)

45212

# Prediction

In [10]:
# Shuffle all unique ids
df_processed = df_processed.dropna()
ids = list(df_processed["id"].unique())
ids_shuffled = random.sample(ids, len(ids))

In [11]:
# Split ids in train/test 0.9/0.1 approximately
# We should predict multiple times with randomly shuffled data to avoid overfitting
print(len(df_processed["id"].unique()))
ids_train = ids_shuffled[:500]
ids_test = ids_shuffled[500:]

559


In [12]:
# Create train and test
df_train = df_processed[df_processed['id'].isin(ids_train)]
df_test = df_processed[df_processed['id'].isin(ids_test)]

X_train = df_train["tweet"]
X_test = df_test["tweet"]
y_train = df_train["location"]
y_test = df_test["location"]

In [13]:
# Transform tweets with TFIDF vectorizer
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [14]:
# Dummy prediction: Random Forest Classifier
# We are predicting just individual tweets

models = [["Random Forest", RandomForestClassifier()], ["Linear SVC", LinearSVC()], ["Logistic Reg", LogisticRegression()]]

for sel_model in models:
    model_name, model = sel_model
    model.fit(X_train,y_train)
    prediction = model.predict(X_test)
    df_test["predictions"] = prediction
    calculate_accuracy_user(model_name)
     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Accuracy for Random Forest: 0.4406779661016949
Accuracy for Linear SVC: 0.576271186440678




Accuracy for Logistic Reg: 0.5084745762711864


# Generating new feature: nº times a city (in CA) is mentioned by user

In [15]:
# Load [Name, Coordinates, Radius] of each Autonomous Community]
Andalusia = ["Andalusia", "37.3399964,-4.5811614,250km"]
Madrid = ["Madrid", "40.5248319,-3.7715628,60km"]
Catalonia = ["Catalonia", "41.8523094,1.5745043,150km"]
Canary_Islands = ["Canary_Islands", "28.5306525,-15.7464439,400km"]
Basque_Country = ["Basque_Country", "42.9911816,-2.5543023,100km"]

CAS = [Andalusia, Madrid, Catalonia, Basque_Country, Canary_Islands]

CAS_name = [CA[0].replace("_", " ") for CA in CAS]

In [16]:
# Create dataframe with all unique ids and 0 in each of the CAS
df_CA_mentioned = pd.DataFrame(0, index=ids, columns=CAS_name)

In [17]:
# Read tweets not processed
df_tweets = pd.read_csv("tweet_dataset_ext.csv")

# Read cities dataset
df_cities = pd.read_csv('cities.csv', encoding='utf-8')

In [23]:
# Count mentiones of cities for each user in every tweet
ids_CA_mention_count = []
for id_n in ids:
    CA_mention_count = [0,0,0,0,0]
    for index, row in df_tweets[df_tweets["id"] == id_n].iterrows():
        if any(i in row["tweet"].split() for i in list(df_cities[df_cities["admin"] == CAS_name[0]]["city"])):
            CA_mention_count[0] += 1
        if any(i in row["tweet"].split() for i in list(df_cities[df_cities["admin"] == CAS_name[1]]["city"])):
            CA_mention_count[1] += 1
        if any(i in row["tweet"].split() for i in list(df_cities[df_cities["admin"] == CAS_name[2]]["city"])):
            CA_mention_count[2] += 1
        if any(i in row["tweet"].split() for i in list(df_cities[df_cities["admin"] == CAS_name[3]]["city"])):
            CA_mention_count[3] += 1
        if any(i in row["tweet"].split() for i in list(df_cities[df_cities["admin"] == CAS_name[4]]["city"])):
            CA_mention_count[4] += 1
            
    ids_CA_mention_count.append([id_n, CA_mention_count, row["location"]])
    

In [26]:
# Save dataframe as csv
# Transform to dataframe
df_ids_CA_mention_count = pd.DataFrame(ids_CA_mention_count, columns = ["id", "CA_mention_count", "location"])
#df_ids_CA_mention_count.to_csv("ids_CA_mention_count.csv", encoding='utf-8-sig', index = False)

In [64]:
for CA in CAS:
    count = [0,0,0,0,0]
    for index, row in df_ids_CA_mention_count[df_ids_CA_mention_count["location"] == CA[0]].iterrows():
        count = np.add(count, row["CA_mention_count"])
    print("\nFor {}:".format(CA[0]))
    for CA_print, count_print in zip(CAS, count):
        print("    {} counts {}".format(CA_print[0],count_print))


For Andalusia:
    Andalusia counts 89
    Madrid counts 41
    Catalonia counts 13
    Basque_Country counts 1
    Canary_Islands counts 0

For Madrid:
    Andalusia counts 12
    Madrid counts 142
    Catalonia counts 17
    Basque_Country counts 1
    Canary_Islands counts 0

For Catalonia:
    Andalusia counts 16
    Madrid counts 47
    Catalonia counts 133
    Basque_Country counts 2
    Canary_Islands counts 0

For Basque_Country:
    Andalusia counts 18
    Madrid counts 87
    Catalonia counts 15
    Basque_Country counts 44
    Canary_Islands counts 0

For Canary_Islands:
    Andalusia counts 1
    Madrid counts 52
    Catalonia counts 17
    Basque_Country counts 4
    Canary_Islands counts 0
