<a href='https://kritikseth.github.io/ipynbtagredirect' target='_parent'><img src='https://raw.githack.com/kritikseth/kritikseth/master/assets/icons/kritik_ipynbtagredirect.svg' alt='Kritik Seth'/></a>

In [1]:
!pip install swachhdata

In [2]:
import swachhdata.text as sdt

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
import pandas as pd
import numpy as np
from prettytable import PrettyTable

import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize, OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.multioutput import RegressorChain

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.svm import LinearSVR

In [4]:
!wget -O train.csv.zip https://github.com/pranavn91/blockchain/blob/master/train.csv.zip?raw=true
!wget -O test.csv.zip https://github.com/pranavn91/blockchain/blob/master/test.csv.zip?raw=true
!unzip train.csv.zip
!unzip test.csv.zip

In [5]:
def _read_csv_skipping_bad_lines(path):
    """Read a CSV file, warning about and skipping malformed rows.

    Uses ``on_bad_lines='warn'`` (pandas >= 1.3) and falls back to the legacy
    ``error_bad_lines=False`` keyword on older pandas, where the new keyword
    raises ``TypeError``. Both spellings skip the offending rows and emit a
    warning for each of them.
    """
    try:
        return pd.read_csv(path, on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know `on_bad_lines`.
        return pd.read_csv(path, error_bad_lines=False)


train = _read_csv_skipping_bad_lines('train.csv')
test = _read_csv_skipping_bad_lines('test.csv')

In [6]:
# (rows, columns) of the raw train and test frames, shown side by side.
(train.shape, test.shape)

((77946, 28), (42157, 4))

In [7]:
# Columns that exist only in `train` are the labels to predict.
# `Index.difference` returns the labels sorted, which is why `target` is ordered
# k1, k10, k11, ... rather than in file order.
target_cols = train.columns.difference(test.columns)

# Take an explicit copy so `target` is independent of `train`, then rebind
# `train` to a label-free frame instead of mutating it in place.
# NOTE: re-running this cell without re-loading the CSVs yields an empty
# `target`, because the label columns are already gone from `train`.
target = train[target_cols].copy()
train = train.drop(columns=target_cols)

In [8]:
# Preview the first three feature rows of the training frame.
train.head(n=3)

Unnamed: 0,id,tweet,state,location
0,1,Jazz for a Rainy Afternoon: {link},oklahoma,Oklahoma
1,2,RT: @mention: I love rainy days.,florida,Miami-Ft. Lauderdale
2,3,Good Morning Chicago! Time to kick the Windy C...,idaho,


In [9]:
# Preview the first three rows of the test frame.
test.head(n=3)

Unnamed: 0,id,tweet,state,location
0,4,Edinburgh peeps is it sunny?? #weather,,birmingham
1,5,"SEEVERE T’STORM WARNING FOR TROUSDALE, NORTHW...",,Nashville
2,7,@Agilis1 sport or traditional climbing? Thats ...,,Midwest


In [10]:
# Preview the first three rows of the label columns split off from `train`.
target.head(n=3)

Unnamed: 0,k1,k10,k11,k12,k13,k14,k15,k2,k3,k4,k5,k6,k7,k8,k9,s1,s2,s3,s4,s5,w1,w2,w3,w4
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.8,0.0,0.2,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.196,0.0,0.804,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [11]:
# Add 0/1 indicator columns marking rows whose `location` / `state` is missing.
# This must run before the NaNs in those columns are replaced with '' (next cell),
# otherwise every flag would be 0.
#
# The flags are computed with a vectorized null test, so they are aligned on the
# frame's own index (no assumption that the index is 0..n-1) and cost O(n)
# instead of a list-membership check per row.
for data in (train, test):
    data['missing_loc'] = data['location'].isnull().astype('int64')
    data['missing_state'] = data['state'].isnull().astype('int64')

In [12]:
# Build one free-text field per row: "<tweet> <state> <location>".
# Missing `state` / `location` values become empty strings so the string
# concatenation below does not propagate NaN into `full_text`.
for data in (train, test):
    data['location'] = data['location'].fillna('')
    data['state'] = data['state'].fillna('')
    # A missing tweet is treated as empty text for the same reason; the `tweet`
    # column itself is left untouched.
    data['full_text'] = data['tweet'].fillna('') + ' ' + data['state'] + ' ' + data['location']

In [13]:
# Plain Python lists of the combined text, one string per row, in row order.
train_text = train['full_text'].tolist()
test_text = test['full_text'].tolist()

In [14]:
# Clean the training text with swachhdata. Every cleaning switch is enabled; the
# verbose log printed below the cell lists the steps the library reports running.
clean_options = dict(
    rem_url=True,
    rem_html=True,
    rem_linebreak=True,
    rem_mention=True,
    exp_contraction=True,
    lower_case=True,
    rem_emoji=True,
    rem_acc_char=True,
    rem_non_ascii_char=True,
    rem_num=True,
    rem_hashtag=True,
    rem_punct=True,
    keep_alpha=True,
    rem_stop_word=True,
    lemmatization=True,
    verbose=True,
)

# Wrapper chain required by the library: TextSetup -> SwachhText -> SwachhSabText.
setup = sdt.TextSetup(train_text)
st = sdt.SwachhText(setup)
ssb = sdt.SwachhSabText(st)

# NOTE: `train_text` is overwritten with its cleaned version.
train_text = ssb.clean_text(**clean_options)

Initializing...

Removing URLs...
Removing HTML tags...
Removing breaks...
Removing mentions...
Expanding conractions...
Removing emojis...
Removing accented characters...
Removing non ASCII characters...
Removing numbers...
Removing hashtags...
Removing punctuations...
Filtering out non alphabets...
Removing stop words...
Lemmatizing...

Complete!



In [15]:
# Clean the test text with swachhdata, using the same switches as for the
# training text so both corpora go through identical preprocessing.
clean_options = dict(
    rem_url=True,
    rem_html=True,
    rem_linebreak=True,
    rem_mention=True,
    exp_contraction=True,
    lower_case=True,
    rem_emoji=True,
    rem_acc_char=True,
    rem_non_ascii_char=True,
    rem_num=True,
    rem_hashtag=True,
    rem_punct=True,
    keep_alpha=True,
    rem_stop_word=True,
    lemmatization=True,
    verbose=True,
)

# Wrapper chain required by the library: TextSetup -> SwachhText -> SwachhSabText.
setup = sdt.TextSetup(test_text)
st = sdt.SwachhText(setup)
ssb = sdt.SwachhSabText(st)

# NOTE: `test_text` is overwritten with its cleaned version.
test_text = ssb.clean_text(**clean_options)

Initializing...

Removing URLs...
Removing HTML tags...
Removing breaks...
Removing mentions...
Expanding conractions...
Removing emojis...
Removing accented characters...
Removing non ASCII characters...
Removing numbers...
Removing hashtags...
Removing punctuations...
Filtering out non alphabets...
Removing stop words...
Lemmatizing...

Complete!



In [16]:
# Store each cleaned corpus back on its frame as the `clean_text` column.
for frame, cleaned in ((train, train_text), (test, test_text)):
    frame['clean_text'] = cleaned

In [17]:
# TF-IDF over 1- to 3-grams of the cleaned text. Terms appearing in more than 75%
# of documents or in fewer than 5 documents are dropped. The vocabulary is learned
# on the training text only and then applied to the test text.
vectorizer = TfidfVectorizer(encoding='utf-8', ngram_range=(1, 3), max_df=0.75, min_df=5)

train_vec = vectorizer.fit_transform(train.clean_text)
test_vec = vectorizer.transform(test.clean_text)

# Project the TF-IDF matrix onto 2 SVD components. The randomized solver is
# stochastic, so the seed is fixed to make the derived features reproducible
# across kernel restarts.
svd = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5, random_state=0, tol=0.0)
train_svd = svd.fit_transform(train_vec)
test_svd = svd.transform(test_vec)

In [18]:
# Wrap the two SVD components in DataFrames with named columns.
col = ['svd1', 'svd2']
svd_train, svd_test = (
    pd.DataFrame(components, columns=col)
    for components in (train_svd, test_svd)
)

In [19]:
# NOTE(review): `ohe` is not used by any cell visible up to the evaluation loop — confirm it is needed.
ohe = OneHotEncoder()

# Second TF-IDF vectorizer: same n-gram and document-frequency settings as
# `vectorizer`, but with the vocabulary capped at 50,000 features.
vectorizer2 = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=(1, 3),
    max_df=0.75,
    min_df=5,
    max_features=50000,
)

In [20]:
# Training design matrix: capped TF-IDF features followed by the two SVD components.
X = sp.sparse.hstack((vectorizer2.fit_transform(train.clean_text), svd_train[['svd1','svd2']]), format='csr')

# Column names must come from `vectorizer2` (the vectorizer that produced X), not
# from the uncapped `vectorizer`, whose vocabulary can differ.
# `get_feature_names_out` replaces `get_feature_names` in newer scikit-learn;
# fall back to the old method where the new one does not exist.
if hasattr(vectorizer2, 'get_feature_names_out'):
    tfidf_names = list(vectorizer2.get_feature_names_out())
else:
    tfidf_names = list(vectorizer2.get_feature_names())
X_cols = tfidf_names + svd_train[['svd1','svd2']].columns.tolist()

assert len(X_cols) == X.shape[1], "feature names do not match the columns of X"

In [21]:
# Test design matrix, built with the already-fitted `vectorizer2` so its columns
# line up with the training matrix, followed by the two SVD components.
test_sp = sp.sparse.hstack((vectorizer2.transform(test.clean_text), svd_test[['svd1','svd2']]), format='csr')

# Column names must come from `vectorizer2` (the vectorizer that produced
# `test_sp`), not from the uncapped `vectorizer`.
# `get_feature_names_out` replaces `get_feature_names` in newer scikit-learn;
# fall back to the old method where the new one does not exist.
if hasattr(vectorizer2, 'get_feature_names_out'):
    tfidf_names = list(vectorizer2.get_feature_names_out())
else:
    tfidf_names = list(vectorizer2.get_feature_names())
test_cols = tfidf_names + svd_test[['svd1','svd2']].columns.tolist()

assert len(test_cols) == test_sp.shape[1], "feature names do not match the columns of test_sp"

In [22]:
# Hold out 20% of the labelled rows for validation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    target,
    test_size=0.2,
    random_state=0,
)

In [23]:
# Fit one Ridge regressor per target column and report hold-out metrics.
cols = target.columns

# The same estimator object is re-fitted for every target; each `fit` call
# replaces the previously learned coefficients.
clf = Ridge(random_state=25, alpha=0.9982890050528583)

table = PrettyTable()
table.field_names = ['Column Name', 'MSE', 'RMSE', 'R2 Score']

for col in cols:
    clf.fit(X_train, y_train[col])
    # Predict once and reuse the predictions for every metric.
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test[col], y_pred)
    # RMSE is derived from MSE directly rather than via `squared=False`, which
    # newer scikit-learn releases no longer accept.
    rmse = mse ** 0.5
    r2 = r2_score(y_test[col], y_pred)
    table.add_row([col, f'{mse:.3f}', f'{rmse:.3f}', f'{r2:.3f}'])

print(table)

+-------------+-------+-------+----------+
| Column Name |  MSE  |  RMSE | R2 Score |
+-------------+-------+-------+----------+
|      k1     | 0.005 | 0.070 |  0.692   |
|     k10     | 0.017 | 0.131 |  0.798   |
|     k11     | 0.007 | 0.085 |  0.783   |
|     k12     | 0.018 | 0.134 |  0.844   |
|     k13     | 0.024 | 0.154 |  0.776   |
|     k14     | 0.003 | 0.056 |  0.786   |
|     k15     | 0.007 | 0.084 |  0.816   |
|      k2     | 0.022 | 0.150 |  0.701   |
|      k3     | 0.002 | 0.046 |  0.293   |
|      k4     | 0.025 | 0.159 |  0.697   |
|      k5     | 0.005 | 0.070 |  0.812   |
|      k6     | 0.000 | 0.022 |  0.578   |
|      k7     | 0.058 | 0.242 |  0.606   |
|      k8     | 0.001 | 0.033 |  0.310   |
|      k9     | 0.021 | 0.146 |  0.161   |
|      s1     | 0.013 | 0.115 |  0.033   |
|      s2     | 0.061 | 0.247 |  0.491   |
|      s3     | 0.048 | 0.219 |  0.603   |
|      s4     | 0.054 | 0.233 |  0.584   |
|      s5     | 0.044 | 0.209 |  0.380   |
|      w1  