In [None]:
# import libraries
import os
import json
import pandas as pd
import plotly_express as px
import numpy as np  
import re 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.optimizers import Adam
import tensorflow as tf
from numpy import genfromtxt
import torch
import transformers
from tqdm.auto import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import math
from sklearn.metrics.pairwise import haversine_distances
from math import radians
from transformers import XLMRobertaTokenizerFast

## Adding Features to Sample Data

In [None]:
# Load the pre-processed sample data from disk.
df = pd.read_csv(filepath_or_buffer='processed data/df.csv')

In [None]:
# Preview the first rows of the frame loaded from 'processed data/df.csv'.
df.head(n=5)

In [None]:
# Column-wise concatenation of the base frame (index reset to 0..n-1) with the
# sentiment, language, topic and entity columns.
# NOTE(review): `sent`, `lan`, `topics` and `ner` are not defined in any cell
# visible here -- confirm they are created earlier so Restart & Run All works.
df_merge = pd.concat(
    objs=[
        df.reset_index(drop=True),
        sent.sent,
        lan.language,
        topics.topic,
        ner.entity,
    ],
    axis='columns',
)

In [None]:
# Preview the merged frame.
df_merge.head(n=5)

In [None]:
# (rows, columns) of the merged frame.
df_merge.shape

In [None]:
# Transformers used by the encoding cell below.
# Generic integer encoder for the nominal text columns.
label_enc = LabelEncoder()
# Sentiment is ordinal: negative < neutral < positive maps to 0 < 1 < 2.
ord_enc = OrdinalEncoder(
    categories=[
        ['negative', 'neutral', 'positive'],
    ],
)
# Standardises the id columns to zero mean / unit variance.
id_scaler = StandardScaler()

In [None]:
# Encode/scale the non-numeric and id columns of df_merge in place.
# NOTE(review): this cell overwrites its own inputs, so it is not idempotent --
# re-running it after the columns are already encoded will not behave the same.
# NOTE(review): the transformers are fitted on the full data set before the
# train/test split further down -- confirm this is intended.

# StandardScaler works on 2-D input, hence the double brackets.
df_merge[['user_id', 'cluster_id']] = id_scaler.fit_transform(df_merge[['user_id', 'cluster_id']])

# OrdinalEncoder also expects 2-D input and returns an (n, 1) array.
df_merge['sent'] = ord_enc.fit_transform(df_merge[['sent']])

# LabelEncoder expects a 1-D array-like. Passing a one-column DataFrame
# triggers a DataConversionWarning, so pass the Series instead.
# The same encoder instance is re-fitted for every column, so afterwards its
# classes_ only describe the last column ('entity').
df_merge['language'] = label_enc.fit_transform(df_merge['language'])
df_merge['topic'] = label_enc.fit_transform(df_merge['topic'])
df_merge['entity'] = label_enc.fit_transform(df_merge['entity'])
 

In [None]:
# Preview df_merge again to inspect its current contents.
df_merge.head(n=5)

In [None]:
# Correlation of every numeric column with the target coordinates.
# df_merge still contains the 'text' column at this point (it is only dropped
# in a later cell), and pandas >= 2.0 no longer silently skips non-numeric
# columns in corr(), so restrict the computation to numeric columns explicitly.
df_merge.corr(numeric_only=True)[['lat', 'lng']]

In [None]:
# Preview the label-based column range from 'year' through 'second' (inclusive).
df_merge.loc[:, slice('year', 'second')].head(n=5)

In [None]:
# Feature-only frame: remove the identifier, the target coordinates and the raw text.
df_feat = df_merge.drop(labels=['id', 'lat', 'lng', 'text'], axis='columns')

In [None]:
# Preview the feature frame.
df_feat.head(n=5)

In [None]:
# (rows, columns) of the feature frame.
df_feat.shape

In [None]:
#df_feat.to_csv('inputs/df_feat.csv', index=False)

In [None]:
# Join df_merge with the matrix X on the row index, then drop the raw text and
# id columns (lat/lng are kept; they are split off as targets later).
# NOTE(review): `X` is not defined in any cell visible here -- confirm it is
# created earlier so Restart & Run All works.
final_df = pd.merge(
    df_merge,
    pd.DataFrame(X),
    left_index=True,
    right_index=True,
).drop(columns=['text', 'id'])

In [None]:
# (rows, columns) of the joined frame.
final_df.shape

In [None]:
# Preview the joined frame.
final_df.head(n=5)

In [None]:
# Load the header-less CSV of XLM features.
# Use a forward slash: the previous 'processed data\X_merge_xlm.csv' relied on
# the invalid escape sequence '\X' and only resolved on Windows. A forward
# slash works on every platform and matches the other read_csv call.
X_merge_xlm = pd.read_csv('processed data/X_merge_xlm.csv', delimiter=',', header=None)

In [None]:
# Index-aligned join of the XLM feature matrix (left) with df_merge (right);
# only the raw text column is removed afterwards.
df_final_merge = pd.merge(
    pd.DataFrame(X_merge_xlm),
    df_merge,
    left_index=True,
    right_index=True,
).drop(columns='text')

In [None]:
# Dense float64 matrix view of the joined frame.
final_merge_array = df_final_merge.to_numpy().astype(np.float64)

In [None]:
# Preview final_df before splitting it into features and targets.
final_df.head(n=5)

In [None]:
# (rows, columns) of final_df before the feature/target split.
final_df.shape

In [None]:
# Targets are the two coordinate columns; every remaining column is a feature.
y = final_df.loc[:, ['lat', 'lng']]
X_ = final_df.drop(labels=['lat', 'lng'], axis='columns')

In [None]:
# Convert features and targets to float64 NumPy arrays.
final_X_array = X_.to_numpy().astype(np.float64)
final_y_array = y.to_numpy().astype(np.float64)

In [None]:
# Train/test split: hold out 20% of the rows as the test set.
# The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    final_X_array,
    final_y_array,
    test_size=0.2,
    random_state=19,
)

In [None]:
# Build, compile and fit a dense regression network that maps the feature
# vector to a (latitude, longitude) pair.
tf.random.set_seed(19)

# Take the optimizer from tf.keras so it comes from the same Keras
# implementation as Sequential/Dense (imported from tensorflow.keras); mixing
# in the standalone `keras` package can fail when the two versions differ.
optimizer = tf.keras.optimizers.Adam(learning_rate=.0001)

# Stop when the *training* loss has not improved for 5 epochs.
# NOTE(review): ReduceLROnPlateau below watches 'val_loss' while this watches
# 'loss' -- confirm the mismatch is intentional. With epochs=1 neither callback
# can ever trigger.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
# Divide the learning rate by 10 after 5 epochs without val_loss improvement.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=5, min_lr=0.0000001)

# define the model architecture
model = Sequential()
model.add(Dense(8000, activation='relu', input_dim=(X_train.shape[1])))
model.add(Dense(4000, activation='relu'))
model.add(Dense(2000, activation='relu'))
model.add(Dense(1000, activation='relu'))
model.add(Dense(2)) # output layer with 2 units for latitude and longitude

# compile the model
# NOTE(review): `loss_haversine` is not defined in any cell visible here --
# confirm it is defined earlier so Restart & Run All works.
model.compile(optimizer=optimizer, loss=loss_haversine, metrics=['mse'])

# train the model
# 10% of the training rows are held out for validation.
# `use_multiprocessing` was dropped: it only applies to generator /
# keras.utils.Sequence input (X_train is an in-memory array) and newer Keras
# versions reject the argument.
with tf.device('/GPU:0'):
    history = model.fit(X_train, y_train, epochs=1, batch_size=32, validation_split=0.10, callbacks=[callback, reduce_lr])


In [None]:
# Predicted (lat, lng) pairs for the held-out test rows.
final_preds = model.predict(x=X_test)

In [None]:
# Loss and compiled metrics on the held-out test set.
model.evaluate(x=X_test, y=y_test)

In [None]:
# Per-epoch training history as a DataFrame (one row per epoch).
df_his = pd.DataFrame(history.history)

# One figure each for loss, MSE and learning rate; the row index is the epoch.
fig_loss = px.line(
    df_his,
    x=df_his.index,
    y=['loss', 'val_loss'],
    labels=dict(value='Loss', index='Epoch'),
    title='Model Loss',
)
fig_acc = px.line(
    df_his,
    x=df_his.index,
    y=['mse', 'val_mse'],
    labels=dict(value='MSE', index='Epoch'),
    title='Model MSE',
)
# NOTE(review): relies on the history containing an 'lr' column -- confirm the
# installed Keras version logs the learning rate under that key.
fig_lr = px.line(
    df_his,
    x=df_his.index,
    y='lr',
    labels=dict(value='Learning Rate', index='Epoch'),
    title='Model Learning Rate',
    log_y=True,
)

# Render the figures in order: loss, MSE, learning rate.
for history_fig in (fig_loss, fig_acc, fig_lr):
    history_fig.show()

In [None]:
# Test-set coordinates, degrees -> radians (multiply by pi/180).
y_test_rad = np.multiply(y_test, math.pi / 180)

In [None]:
# Predicted coordinates, degrees -> radians (multiply by pi/180).
preds_rad = np.multiply(final_preds, math.pi / 180)

In [None]:
# calculate distance
# Great-circle distance between each actual point and its *own* prediction.
# sklearn's haversine_distances(a, b) returns the full pairwise matrix, so the
# previous `[0]` selected row 0: the distance from the first actual point to
# every prediction. Compute the row-wise haversine formula directly instead;
# this is also O(n) in memory rather than O(n^2).
# Column 0 is latitude and column 1 is longitude (both in radians).
lat_true, lng_true = y_test_rad[:, 0], y_test_rad[:, 1]
lat_pred, lng_pred = preds_rad[:, 0], preds_rad[:, 1]
hav = (
    np.sin((lat_pred - lat_true) / 2) ** 2
    + np.cos(lat_true) * np.cos(lat_pred) * np.sin((lng_pred - lng_true) / 2) ** 2
)
# Clip guards arcsin against rounding pushing the value slightly outside [0, 1].
distances_final = 2 * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))
# Unit-sphere angle -> kilometres using the mean Earth radius (6371 km).
distances_km_final = distances_final * (6371000/1000)

In [None]:
# Bar chart of the distances, one bar per entry of distances_km_final.
px.bar(
    distances_km_final,
    title='Distances Between Actual and Prediction',
    labels={'value': 'Distance (Km)'},
    template='plotly_white',
)

In [None]:
# Box plot summarising the distribution of the distances.
px.box(
    distances_km_final,
    title='Distribution of Distances',
    labels={'value': 'Distance (Km)'},
    template='plotly_white',
)

## Statistical Testing

    Null Hypothesis: The mean prediction-error distances of the two samples are equal (there is no difference between them)

In [None]:
from scipy import stats as st

# Critical statistical significance level: a p-value below alpha means the
# null hypothesis is rejected.
alpha = 0.05

# Independent two-sample t-test on the two sets of distances.
# NOTE(review): `distances_km` is not defined in any cell visible here --
# confirm it is computed earlier so Restart & Run All works.
results = st.ttest_ind(distances_km, distances_km_final)

print('p-value: ', results.pvalue)

print(
    "We reject the null hypothesis, the sample distances are different"
    if results.pvalue < alpha
    else "We can't reject the null hypothesis"
)