In [1]:
!pip3 install pandas
!pip3 install numpy
!pip3 install scikit-learn
!pip3 install scipy



In [4]:
import ast
import pickle
import pandas as pd
import numpy as np
from scipy.spatial import KDTree
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [92]:
df_airline = pd.read_csv("airline_tweets.csv", engine='python')
df_cities = pd.read_csv("cities.csv")

## `CleanDataTask`

In [93]:
# Cleaning the airline dataframe

# Removing rows cantaining NAN
df_airline.dropna(subset=['tweet_coord'], inplace=True)
# df_airline.reset_index(drop=True, inplace= True)

# Removing rows containing '0.0, 0.0'
df_airline['tweet_coord'] = df_airline['tweet_coord'].astype(str)
df_airline.drop(df_airline[df_airline.tweet_coord == '[0.0, 0.0]'].index, inplace=True)
df_airline.reset_index(drop=True, inplace= True)

## `TrainingDataTask`

In [94]:
# Converting list as string to list
df_airline['tweet_coord'] = df_airline['tweet_coord'].apply(lambda x: ast.literal_eval(x))

# Dropping the dublicate values for the cities
df_cities = df_cities.drop_duplicates(subset='name', keep="first")
df_cities.reset_index(drop=True, inplace= True)

# combining the latitude and longitude columns of the dataframe
df_cities['latlong'] = df_cities.apply(lambda x: list([x['latitude'],x['longitude']]),axis=1)

### Nearest city logic

In [95]:
# Closest city sentiment 
s2 = np.array(list(df_cities['latlong']))
s1 = np.array(list(df_airline['tweet_coord']))

kdtree = KDTree(s1)
neighbours = kdtree.query(s2)

In [96]:
airline_sentiment = []
for i in neighbours[1]:
    airline_sentiment.append(df_airline['airline_sentiment'][i])

In [97]:
df_features = pd.DataFrame(list(zip(df_cities['name'], airline_sentiment)),
                           columns=['city_name', 'airline_sentiment'])

In [98]:
print(df_features.groupby(['airline_sentiment']).size())
df_features = df_features.drop(df_features[df_features['airline_sentiment'] == 'negative'].sample(frac=.54).index)
df_features = df_features.drop(df_features[df_features['airline_sentiment'] == 'neutral'].sample(frac=.50).index)
print(df_features.groupby(['airline_sentiment']).size())
df_features.reset_index(drop=True, inplace= True)

airline_sentiment
negative    9481
neutral     8534
positive    4147
dtype: int64
airline_sentiment
negative    4361
neutral     4267
positive    4147
dtype: int64


In [99]:
le = preprocessing.LabelEncoder()
df_features['airline_sentiment'] = le.fit_transform(df_features['airline_sentiment'])
df_features = pd.concat([df_features, pd.get_dummies(df_features['city_name'], prefix='city_name', drop_first=True)], axis=1)
df_features.drop(columns=['city_name'], inplace=True)

## `TrainingModelTask`

In [100]:
y = df_features.pop('airline_sentiment')
X = df_features

In [101]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)

LogisticRegression()

In [102]:
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

## `ScoreTask`

In [103]:
# Reverse the pandas dummy
name = X.idxmax(axis=1)
city_name = list(map( lambda x: x.replace( 'city_name_', ''), name))

In [104]:
negative_probability = loaded_model.predict_proba(X)[:,0]
neutral_probability =loaded_model.predict_proba(X)[:,1]
positive_probability =loaded_model.predict_proba(X)[:,2]

In [105]:
df = pd.DataFrame(list(zip(city_name, negative_probability, neutral_probability, positive_probability)), 
                  columns =['city_name', 'negative_probability', 'neutral_probability', 'positive_probability']) 

In [106]:
df.sort_values('positive_probability', inplace=True, ascending=False)

In [107]:
df.head(1000)

Unnamed: 0,city_name,negative_probability,neutral_probability,positive_probability
6387,Moncalieri,0.250892,0.246308,0.5028
10944,Luts’k,0.250892,0.246308,0.5028
10936,Melitopol’,0.250892,0.246308,0.5028
10937,Mariupol,0.250892,0.246308,0.5028
10938,Marhanets’,0.250892,0.246308,0.5028
...,...,...,...,...
2954,Konz,0.250892,0.246308,0.5028
2963,Kerpen,0.250892,0.246308,0.5028
2962,Kevelaer,0.250892,0.246308,0.5028
12044,Hermitage,0.250892,0.246308,0.5028


In [108]:
len(set(df['city_name']))

12774

In [109]:
len(df['city_name'])

12775