In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import re
import os

In [2]:
#read data
data = pd.read_csv('Data/merged_data.csv')

In [3]:
#to get just the value from timezone
def extract_timezone(text):
    regex = r'^([^/]+)'
    match = re.search(regex, text)
    if match:
        return match.group(0)
    else:
        return None
data['timezone'] = data['timezone'].apply(extract_timezone)

In [4]:
def extract_coordinates(txt):
    # Define patterns to extract latitude and longitude
    lat_pattern = r'\'lat\'\:\s*([-+]?\d+\.\d+)' # Pattern for latitude
    lon_pattern = r'\'lon\'\:\s*([-+]?\d+\.\d+)'  # Pattern for longitude
    
    # Find latitude and longitude using regular expressions
    latitude = re.search(lat_pattern, txt)
    longitude = re.search(lon_pattern, txt)

    if latitude and longitude:  # If both latitude and longitude are found
        return pd.Series([float(latitude.group(1)), float(longitude.group(1))], index=['Latitude', 'Longitude'])
    else:
        return pd.Series([None, None], index=['Latitude', 'Longitude'])  # Return None for missing values


In [5]:
data[['Latitude', 'Longitude']] = data['coordinates'].apply(lambda x: extract_coordinates(x))

  data[['Latitude', 'Longitude']] = data['coordinates'].apply(lambda x: extract_coordinates(x))


In [6]:
data_encoded = pd.get_dummies(data, columns=['timezone'])

In [7]:
def get_features_target():
    
    # Select features and target variable   
    X = data_encoded.drop(columns=['geoname_id', 'dem' ,'area_name','feature_code','coordinates', 'Country Code', 'Number of Arrivals', 'Number of Departures', 'Country Name', 'modification_date', 'Expenditures'],axis=1)
    y = data_encoded['Country Name']

    return X, y

In [8]:
X, y = get_features_target()
X

Unnamed: 0,population,elevation,Year,Unemployment rate,Latitude,Longitude,timezone_Africa,timezone_America,timezone_Asia,timezone_Atlantic,timezone_Europe,timezone_Pacific
0,3253,620.0,2013,4.75,47.24496,8.53299,False,False,False,False,True,False
1,5000,680.0,2013,4.75,46.31667,7.98333,False,False,False,False,True,False
2,1100,350.0,2013,4.75,46.19057,6.04287,False,False,False,False,True,False
3,1316,418.0,2013,4.75,47.70797,8.80949,False,False,False,False,True,False
4,1557,903.0,2013,4.75,47.14888,9.14233,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
24070,16073,7.0,2017,10.80,18.42745,-67.15407,False,True,False,False,False,False
24071,1089,96.0,2010,6.97,52.15678,9.81265,False,False,False,False,True,False
24072,76957,0.0,2017,4.16,47.48938,19.07292,False,False,False,False,True,False
24073,7732,468.0,2015,4.70,11.90518,-86.09446,False,True,False,False,False,False


In [9]:
#test train split
X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size=0.2, random_state=42)

In [10]:
def defineModels():
    model_knn_neighbours = make_pipeline(
                       StandardScaler(),
                       KNeighborsClassifier(n_neighbors=13))

    model_random_forest = make_pipeline(
        StandardScaler(),
        RandomForestClassifier(n_estimators=150,max_depth=12,min_samples_leaf=4))
    
    model_gradient_boosting = make_pipeline(
        StandardScaler(),
        GradientBoostingClassifier(n_estimators=50, max_depth=2,min_samples_leaf=0.5))

    return model_knn_neighbours, model_random_forest, model_gradient_boosting 

In [11]:
model_knn_neighbours, model_random_forest, model_gradient_boosting  = defineModels()

In [12]:
# K nearest Neighbours
model_knn_neighbours.fit(X_train, y_train)

#Printing model accuracy
print("Results for K nearest Neighbours:")
print("Training Score", model_knn_neighbours.score(X_train,y_train))
print("Accuracy score of Model", model_knn_neighbours.score(X_valid,y_valid))

#Truth vs Prediction
df_knn_neighbours = pd.DataFrame({'truth': y_valid, 'prediction': model_knn_neighbours.predict(X_valid)})
print("Number of Incorrect predictions:" , len(df_knn_neighbours[df_knn_neighbours['truth'] != df_knn_neighbours['prediction']]))

#classification_report
print(classification_report(y_valid,model_knn_neighbours.predict(X_valid),zero_division=0))

Results for K nearest Neighbours:
Training Score 0.9919003115264797
Accuracy score of Model 0.9889927310488058
Number of Incorrect predictions: 53
                precision    recall  f1-score   support

     Argentina       1.00      1.00      1.00         1
       Austria       1.00      0.83      0.91        12
       Belarus       0.00      0.00      0.00         1
       Belgium       0.00      0.00      0.00         1
        Brazil       1.00      1.00      1.00         3
      Bulgaria       0.00      0.00      0.00         2
        Canada       0.00      0.00      0.00         3
      Colombia       0.00      0.00      0.00         2
    Costa Rica       0.00      0.00      0.00         1
Czech Republic       0.50      0.50      0.50         2
       Estonia       0.00      0.00      0.00         1
       Finland       0.00      0.00      0.00         0
        France       0.89      0.67      0.76        12
       Germany       0.57      0.72      0.63        18
        Gree

In [13]:
#random forest results
model_random_forest.fit(X_train, y_train)

#Printing model accuracy
print("Results for Random Forest Classifier:")
print("Training Score", model_random_forest.score(X_train,y_train))
print("Accuracy score of Model",model_random_forest.score(X_valid,y_valid))

#Truth vs Prediction
df_random_forest = pd.DataFrame({'truth': y_valid, 'prediction': model_random_forest.predict(X_valid)})
print("Number of Incorrect predictions RandomForest Model :" , len(df_random_forest[df_random_forest['truth'] != df_random_forest['prediction']]))

#classification_report
print(classification_report(y_valid,model_random_forest.predict(X_valid), zero_division=0))

Results for Random Forest Classifier:
Training Score 0.9982866043613707
Accuracy score of Model 0.9952232606438214
Number of Incorrect predictions RandomForest Model : 23
                    precision    recall  f1-score   support

         Argentina       1.00      1.00      1.00         1
           Austria       0.92      1.00      0.96        12
           Belarus       0.00      0.00      0.00         1
           Belgium       0.00      0.00      0.00         1
            Brazil       1.00      1.00      1.00         3
          Bulgaria       0.00      0.00      0.00         2
            Canada       0.00      0.00      0.00         3
          Colombia       0.00      0.00      0.00         2
        Costa Rica       0.00      0.00      0.00         1
    Czech Republic       1.00      1.00      1.00         2
           Estonia       0.00      0.00      0.00         1
            France       1.00      0.92      0.96        12
           Germany       0.78      1.00      0.8

In [14]:
#It takes about 30 seconds to run

#Gradient Boosting 
model_gradient_boosting.fit(X_train, y_train)

#Printing model accuracy
print("Results for Gradient boosting Classifier:")
print("Training Score", model_gradient_boosting.score(X_train,y_train))
print("Accuracy score of Model", model_gradient_boosting.score(X_valid,y_valid))

#Truth vs Prediction
df_gradient_boosting = pd.DataFrame({'truth': y_valid, 'prediction': model_gradient_boosting.predict(X_valid)})
print("Number of Incorrect predictions:" , len(df_gradient_boosting[df_gradient_boosting['truth'] != df_gradient_boosting['prediction']]))

#classification_report
print(classification_report(y_valid,model_gradient_boosting.predict(X_valid),zero_division=0))

Results for Gradient boosting Classifier:
Training Score 0.8199896157840083
Accuracy score of Model 0.818276220145379
Number of Incorrect predictions: 875
                precision    recall  f1-score   support

     Argentina       0.00      0.00      0.00         1
       Austria       0.00      0.00      0.00        12
       Belarus       0.00      0.00      0.00         1
       Belgium       0.00      0.00      0.00         1
        Brazil       0.00      0.00      0.00         3
      Bulgaria       0.00      0.00      0.00         2
        Canada       0.00      0.00      0.00         3
      Colombia       0.00      0.00      0.00         2
    Costa Rica       0.00      0.00      0.00         1
Czech Republic       0.00      0.00      0.00         2
       Estonia       0.00      0.00      0.00         1
        France       0.00      0.00      0.00        12
       Germany       0.00      0.00      0.00        18
        Greece       0.00      0.00      0.00         2
    

In [15]:
#as seeing above random foreset produces the best results
X_valid['Country Name'] = y_valid
X_valid['Predictions'] = df_random_forest['prediction']

In [16]:
#saving true vs precition values in resulting file for the comparison
path = 'Country_predictions/'

if not os.path.exists(path):
    os.makedirs(path)
        
X_valid.to_csv('country_predictions/validation_results.csv')