In [1]:
!pip3 install pandas
!pip3 install numpy
!pip3 install scikit-learn
!pip3 install scipy



In [4]:
import ast
import pickle
import pandas as pd
import numpy as np
from scipy.spatial import KDTree
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [33]:
df_airline = pd.read_csv("airline_tweets.csv", engine='python')
df_cities = pd.read_csv("cities.csv")

## `CleanDataTask`

In [34]:
# Cleaning the airline dataframe

# Removing rows cantaining NAN
df_airline.dropna(subset=['tweet_coord'], inplace=True)
# df_airline.reset_index(drop=True, inplace= True)

# Removing rows containing '0.0, 0.0'
df_airline['tweet_coord'] = df_airline['tweet_coord'].astype(str)
df_airline.drop(df_airline[df_airline.tweet_coord == '[0.0, 0.0]'].index, inplace=True)
df_airline.reset_index(drop=True, inplace= True)

## `TrainingDataTask`

In [35]:
# Converting list as string to list
df_airline['tweet_coord'] = df_airline['tweet_coord'].apply(lambda x: ast.literal_eval(x))

# Dropping the dublicate values for the cities
df_cities = df_cities.drop_duplicates(subset='name', keep="first")
df_cities.reset_index(drop=True, inplace= True)

# combining the latitude and longitude columns of the dataframe
df_cities['latlong'] = df_cities.apply(lambda x: list([x['latitude'],x['longitude']]),axis=1)

### Nearest city logic

In [36]:
# Closest city sentiment 
s2 = np.array(list(df_cities['latlong']))
s1 = np.array(list(df_airline['tweet_coord']))

kdtree = KDTree(s1)
neighbours = kdtree.query(s2)

In [37]:
airline_sentiment = []
for i in neighbours[1]:
    airline_sentiment.append(df_airline['airline_sentiment'][i])

In [38]:
df_features = pd.DataFrame(list(zip(df_cities['name'], airline_sentiment)),
                           columns=['city_name', 'airline_sentiment'])

In [39]:
print(df_features.groupby(['airline_sentiment']).size())
df_features = df_features.drop(df_features[df_features['airline_sentiment'] == 'negative'].sample(frac=.54).index)
df_features = df_features.drop(df_features[df_features['airline_sentiment'] == 'neutral'].sample(frac=.50).index)
print(df_features.groupby(['airline_sentiment']).size())
df_features.reset_index(drop=True, inplace= True)

airline_sentiment
negative    9481
neutral     8534
positive    4147
dtype: int64
airline_sentiment
negative    4361
neutral     4267
positive    4147
dtype: int64


In [40]:
le = preprocessing.LabelEncoder()
df_features['airline_sentiment'] = le.fit_transform(df_features['airline_sentiment'])
df_features = pd.concat([df_features, pd.get_dummies(df_features['city_name'], prefix='city_name', drop_first=True)], axis=1)
df_features.drop(columns=['city_name'], inplace=True)

## `TrainingModelTask`

In [42]:
y = df_features.pop('airline_sentiment')
X = df_features

In [43]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)

LogisticRegression()

In [64]:
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

## `ScoreTask`

In [65]:
# Reverse the pandas dummy
name = X.idxmax(axis=1)
city_name = list(map( lambda x: x.replace( 'city_name_', ''), name))

In [66]:
negative_probability = loaded_model.predict_proba(X)[:,0]
neutral_probability =loaded_model.predict_proba(X)[:,1]
positive_probability =loaded_model.predict_proba(X)[:,2]

In [67]:
df = pd.DataFrame(list(zip(city_name, negative_probability, neutral_probability, positive_probability)), 
                  columns =['city_name', 'negative_probability', 'neutral_probability', 'positive_probability']) 

In [68]:
df.sort_values('positive_probability', inplace=True, ascending=False)

In [69]:
df.head(1000)

Unnamed: 0,city_name,negative_probability,neutral_probability,positive_probability
9353,Skopin,0.250874,0.246325,0.502801
12079,Murrysville,0.250874,0.246325,0.502801
2915,Leichlingen,0.250874,0.246325,0.502801
7916,Jalingo,0.250874,0.246325,0.502801
2918,Lebach,0.250874,0.246325,0.502801
...,...,...,...,...
2520,Remedios,0.250874,0.246325,0.502801
2521,Regla,0.250874,0.246325,0.502801
2522,Ranchuelo,0.250874,0.246325,0.502801
2524,Placetas,0.250874,0.246325,0.502801
