In [1]:
import os
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Input
from keras.models import Model
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
import gc

In [22]:
def remove_punctuations(x):
    pun = '''!()-[]{};:'""\,<>.+=`/?@’”#$%^&£*�_“‚…‘‘•~-—–1234567890'''
    for i in range(len(x)):
        x[i] = x[i].lower()
        if('\n' in x[i]):
            x[i] = x[i].replace('\n',' ')
        for exp in pun:
            if exp in x[i]:
                x[i] = x[i].replace(exp,'')
        if(len(x[i])>200):
            x[i] = x[i][:x[i].find(' ',200)]
    return x

def get_real_y(y):
    ny = pd.DataFrame(y)
    ny[ny == 'foreignpolicy'] = 0
    ny[ny == 'nationalpol'] = 1
    ny[ny == 'personal'] = 2
    ny[ny == 'selfpolitics'] = 3
    ny[ny == 'statepol'] = 4
    ny[ny == 'indeterminate'] = -1     
    print(ny)
    return ny
def get_class(c):
    if(c == 0):
        return 'foreignpolicy'
    elif(c == 1):
        return 'nationalpol'
    elif(c == 2):
        return 'personal'
    elif(c == 3):
        return 'selfpolitics'
    else:
        return 'statepol'
    

def get_train_test_data(x,y):
    neg_ind = np.where(y==-1)[0]
    non_neg = np.where(y!=-1)[0]
    test_x = x[neg_ind]
    test_y = y[neg_ind]
    train_x = x[non_neg]
    train_y = y[non_neg]
    return train_x,train_y,test_x,test_y

def nFold(samples,labels,n,i):
    l = len(samples)
    start = int(l/n)*i
    end = int(l/n)*(i+1)
    return np.concatenate((samples[0:start],samples[end:l]),axis = 0),np.concatenate((labels[0:start],labels[end:l]),axis = None),samples[start:end],labels[start:end]    

In [3]:
col = ['text','typegeneral']
df = pd.read_feather('../input/csh-jt-intern/dataframeX',columns = col)

In [4]:
x,y = list(df['text']),list(df['typegeneral'])
x = remove_punctuations(x)
y = get_real_y(y)
y = np.array(y[0])

        0
0       3
1       2
2       2
3       2
4       2
...    ..
692537  1
692538  1
692539  1
692540  1
692541  1

[692542 rows x 1 columns]


In [5]:
mf = 1000
vectorizer = TfidfVectorizer(stop_words = 'english',max_features = mf)
x = vectorizer.fit_transform(x)

In [8]:
train_x,train_y,test_x,test_y = get_train_test_data(x,y)

In [38]:
del train_x,train_y
gc.collect()

22

In [9]:
train_x = train_x.toarray().astype(np.float32)
train_y = to_categorical(train_y).astype(np.float32)
test_x = test_x.toarray().astype(np.float32)

In [10]:
test_x.shape

(335559, 1000)

In [11]:
model=Sequential()
model.add(Dense(100,input_shape=(1,mf)))
model.add(Dropout(0.2))
model.add(Dense(1000))
model.add(Dropout(0.2))
model.add(Dense(100))
model.add(Dense(5,activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1, 100)            100100    
_________________________________________________________________
dropout (Dropout)            (None, 1, 100)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1, 1000)           101000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 1000)           0         
_________________________________________________________________
dense_2 (Dense)              (None, 1, 100)            100100    
_________________________________________________________________
dense_3 (Dense)              (None, 1, 5)              505       
Total params: 301,705
Trainable params: 301,705
Non-trainable params: 0
__________________________________________________

In [33]:
train_y

array([[0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)

In [12]:
model.fit(train_x,train_y,batch_size = 32, epochs = 1,verbose = 1)



<tensorflow.python.keras.callbacks.History at 0x7fa6a043d190>

In [36]:
#Testing
model.fit(train_x[:100000],train_y[:100000],batch_size = 32, epochs = 1,verbose = 1,validation_data=(train_x[100000:200000], train_y[100000:200000]))



<tensorflow.python.keras.callbacks.History at 0x7ff2462fb9d0>

In [13]:
pred = model.predict(test_x)

In [14]:
pred_classes = pred.argmax(axis = -1)

In [18]:
print(len(pred_classes))
print(len(test_x))
print(len(df[df['typegeneral']=='indeterminate']))

335559
335559
335559


In [35]:
predicted_class = []
predicted_prob = []
j = 0
for i in df['typegeneral']:
    if(i =='indeterminate'):
        c = get_class(predicted_class[j])
        p = max(pred[j])
        j+=1
    else:
        c = i
        p = 1
    predicted_class.append(c)
    predicted_prob.append(p)

In [42]:
data = {
    'Predicted_Class':predicted_class,
    'Probability':predicted_prob
}

In [43]:
dfn = pd.DataFrame(data)

In [47]:
dfn.to_csv('results.csv',index = False)

In [48]:
pp = np.array(predicted_prob)

In [49]:
print(np.max(pp))
print(np.min(pp))
print(np.mean(pp))
print(np.var(pp))

1.0
0.22524723410606384
0.8848673352861062
0.03314284281941712
