**CNN-LSTM MODEL FOR REGRESSION USING STACK OVERFLOW DATASET**

In [None]:
# Mount the user's Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Relative path of the project folder on the mounted drive.
# NOTE(review): the visible cells read the CSV through an absolute path and never use this variable.
project_folder = "./drive/My Drive/csc2515-project/"

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

# Load the training set of (question, answer) feature rows.
data = pd.read_csv('/content/drive/My Drive/csc2515-project/Train.csv',  encoding='Latin-1')


def _has_unique_top_score(group):
  """Return True when exactly one answer of the question holds the top Score.

  A question with a single answer trivially has a unique best answer, so it
  is kept (comparing the two largest scores would otherwise raise IndexError).
  """
  top_two = group['Score'].nlargest(2)
  return len(top_two) < 2 or top_two.iloc[0] != top_two.iloc[1]


# Flag the answer(s) whose score equals the maximum score of their question.
# The string 'max' selects the built-in groupby reduction; passing the Python
# builtin `max` is deprecated in recent pandas releases.
data['Labels'] = data.groupby(['QId'])['Score'].transform('max') == data['Score']
# Same flag as 0/1 integers.
data['Best_Score'] = data['Labels'].astype(int)

# Filter out any questions with more than one answer with the maximum score
data = data.groupby('QId').filter(_has_unique_top_score)
# Reset indices
data.reset_index(drop=True, inplace=True)

# Input list: every remaining column is used as a numeric feature.
X = data.drop(['Score','QId', 'Labels', 'Best_Score', 'Clean_Question', 'Clean_Answer', 'QAskerId', 'OwnerUserId'], axis=1).values.tolist()

# Normalize scores: divide each answer's score by the best score of its question.
# `transform` returns a Series aligned row-for-row with `data`, so positional
# slicing of the targets later on always matches the rows of X. The result of
# `groupby(...).apply(...)` may instead be re-ordered/re-indexed by group key
# depending on the pandas version.
# NOTE(review): a question whose best score is <= 0 yields inf/NaN or inverted
# targets here -- confirm the dataset excludes such questions.
normalized_scores = data['Score'] / data.groupby('QId')['Score'].transform('max')

In [None]:
# Show the first five rows; to_string() renders every column instead of truncating the wide frame.
first_rows = data.head(5)
print(first_rows.to_string())

   Score  QId  Before  After  Cosine_to_Question  Cosine_to_Answers  Word_cnt  Char_cnt  Avg_char_length  Urls  Codes  Grade_Level  Dale_chall  Reading_Ease  Polarity  Subjectivity  Cumulative_ Answer_Score  Num_Answers  Num_Questions  Average_Answer_Score  Q_Word_Cnt  Q_Char_Cnt  Q_Avg_char_length  Q_Urls  Q_CodeSections  Q_GradeLevel  Q_Dale_chall  Q_ReadingEase  Q_Polarity  Q_Subjectivity                                                                                                                                                                                                             Clean_Question                                                                                                                                                                                                                                                                                                                                                                                                   

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (centre on the mean, scale to unit variance).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so test-set statistics leak into the training
# features. Consider fitting on the training rows only.
feature_scaler = StandardScaler(with_mean=True)
feature_scaler.fit(X)
X = feature_scaler.transform(X)

In [None]:
# Start from an 80/20 cut, then move the cut forward to the next question so
# that the split never falls in the middle of a question's answers.
# (Presumably all answers of a question sit on consecutive rows -- verify.)
split_index = int(len(X) * 0.8)
question_ids = data['QId']
split_Q = question_ids.iloc[split_index]
while split_index < len(X) and question_ids.iloc[split_index] == split_Q:
  split_index += 1

# Materialise the train/test partitions as NumPy arrays.
# Targets are the per-question normalized scores.
Xtr = np.asarray(X[:split_index])
Ytr = np.asarray(normalized_scores[:split_index])
Xte = np.asarray(X[split_index:])
Yte = np.asarray(normalized_scores[split_index:])

**TRAINING THE MODEL**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed TensorFlow so weight initialisation and batch shuffling are repeatable
# between runs (as far as the backend's kernels allow).
tf.random.set_seed(42)

# Add a trailing channel axis: each sample becomes (n_features, 1), i.e. the
# feature vector is fed to Conv1D as a one-channel sequence.
X_train = np.expand_dims(Xtr, axis=2)
X_test = np.expand_dims(Xte, axis=2)

# Take the sequence length from the data rather than hard-coding the current
# feature count, so the model keeps working if the feature set changes.
n_features = X_train.shape[1]

# CNN-LSTM regressor: two 1-D convolutions, an LSTM, and one linear output unit.
model = keras.Sequential()
model.add(layers.Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(n_features, 1)))
# Only the first layer needs an input shape; later layers infer theirs.
model.add(layers.Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(layers.LSTM(128))
model.add(layers.Dense(1))

# Regression objective: mean squared error against the normalized scores.
model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=['mean_squared_error'])

model.summary()

# Left over from a classification variant; it is not passed to fit() below.
class_weight = {0: 1.,
                1: 1.,
                }

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are not an independent estimate.
history = model.fit(X_train, Ytr, batch_size=32, epochs=12, validation_data=(X_test, Yte))

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_14 (Conv1D)           (None, 28, 64)            256       
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 26, 64)            12352     
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 111,553
Trainable params: 111,553
Non-trainable params: 0
_________________________________________________________________
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<tensorflow.python.keras.callbacks.History at 0x7f0e5c87b7b8>

**EVALUATE**

In [None]:
# Score every test-set answer with the trained model, then print the
# predictions followed by the true normalized scores for a quick visual check.
predictions = model.predict(X_test)
print(predictions, Yte, sep='\n')

[[0.24351183]
 [0.38891512]
 [0.27911925]
 ...
 [0.2408066 ]
 [0.13713402]
 [0.10588131]]
[0.  1.  0.2 ... 0.  0.  0. ]


In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error

# Mean squared error between the true normalized scores and the model output.
test_mse = mean_squared_error(Yte, predictions)
print('Mean squared error: %.2f' % test_mse)

Mean squared error: 0.10


In [None]:
# Convert the predictions to a nested Python list.
# Going through np.asarray keeps the cell re-runnable: a plain list has no
# .tolist(), so running the cell a second time would otherwise fail.
predictions = np.asarray(predictions).tolist()

In [None]:
# Preview only the first few predictions; displaying the whole list floods the notebook output.
predictions[:10]

[[0.24351182579994202],
 [0.38891512155532837],
 [0.27911925315856934],
 [0.2332509309053421],
 [0.16218538582324982],
 [0.10346757620573044],
 [0.11288310587406158],
 [0.07735898345708847],
 [0.3272777795791626],
 [0.256559818983078],
 [0.2463042438030243],
 [0.3223716914653778],
 [0.3741086721420288],
 [0.19730287790298462],
 [0.2009502500295639],
 [0.2769363522529602],
 [0.3793785572052002],
 [0.3600804805755615],
 [0.2729732096195221],
 [0.29712241888046265],
 [0.1330973207950592],
 [0.1723373830318451],
 [0.3612514138221741],
 [0.3332730829715729],
 [0.4273498058319092],
 [0.13638581335544586],
 [0.16226139664649963],
 [0.17699772119522095],
 [0.0891227126121521],
 [0.2420322597026825],
 [0.26210784912109375],
 [0.31521308422088623],
 [0.29625964164733887],
 [0.22422170639038086],
 [0.13781027495861053],
 [0.055810101330280304],
 [0.42145711183547974],
 [0.20705388486385345],
 [0.1631949543952942],
 [0.10680225491523743],
 [0.12118934094905853],
 [0.29703837633132935],
 [0.1522922

In [None]:
# Test-set rows, re-indexed from 0 so their positions line up with the prediction rows.
test_data = data.iloc[split_index:].reset_index(drop=True)

# One row per test answer: the predicted score plus the question it belongs to.
pred_df = pd.DataFrame(data=predictions).rename(columns={0: 'Prediction'})
pred_df['QId'] = test_data['QId'].tolist()

# For every question, the row index of the answer with the highest prediction
# (idxmax returns the first one on ties).
best_answers_pred = pred_df.groupby(["QId"])["Prediction"].idxmax().tolist()
# For every question, the row index of the answer with the highest true score.
best_answers_test = test_data.groupby(["QId"])["Score"].idxmax().tolist()

       Prediction
0        0.243512
1        0.388915
2        0.279119
3        0.233251
4        0.162185
...           ...
22318    0.321495
22319    0.179124
22320    0.240807
22321    0.137134
22322    0.105881

[22323 rows x 1 columns]


In [None]:
# Accuracy: fraction of test questions whose top-predicted answer is the true best answer.
count = len(best_answers_test)
correct = sum(
  1 for i in range(count)
  if best_answers_pred[i] == best_answers_test[i]
)
print(correct / count)

0.3736095128500192
