In [1]:
!pip install catboost
! pip install transformers
! pip install lightgbm
! pip install pytorch_pretrained_bert
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# data prep
import nltk
from nltk.tokenize import word_tokenize
import os
from sklearn.model_selection import train_test_split

# model
import transformers as ppb 
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import catboost
import torch
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import  BertConfig, BertModel, BertForMaskedLM 
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule




  import pandas.util.testing as tm


In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
SEED = 43
np.random.seed(SEED)

columns = ["text", "parent_text", "score"]
df = pd.concat([
    pd.read_csv("/content/drive/My Drive/Kaggle/Twits/Data/comments_positive.csv", usecols=columns, na_filter=False),
    pd.read_csv("/content/drive/My Drive/Kaggle/Twits/Data/comments_negative.csv", usecols=columns, na_filter=False)
], ignore_index=True)

#df = df.rename(columns = {'score':'label'})

## Missing and duplicated values


In [0]:
df.isna().sum()

In [0]:
drop_rows = list(df[(df['text'].isin([' ']) == True)].index) + list(df[(df['parent_text'].isin([' ']) == True)].index) + \
 list(df[(df['text'].isin(['']) == True)].index) + list(df[(df['parent_text'].isin(['']) == True)].index)

a = df.groupby(by = ['text','parent_text']).count()
duplicates = list(a[a['score']>1].index)
len(duplicates)

In [0]:
same_scoe = []
different_score = []
for i in range(len(duplicates)):
  check = df[(df['text'] == duplicates[i][0]) & (df['parent_text'] == duplicates[i][1])]
  if all(check['score'] == check['score'].mean()): # some duplicatec could have the same scores so its i am not going to drop
    same_scoe.extend(list(check.index))
  else:
    different_score.extend(list(check.index))

In [0]:
to_drop = list(set(different_score + drop_rows))

In [0]:
len(to_drop)

In [0]:
with open('/content/drive/My Drive/Kaggle/Twits/Data/drop_rows.txt', 'w') as f:
    for item in to_drop:
        f.write(str(item) + '\n')

## Upload the indexes to drop and crate the cleaned dataset

In [0]:
drop_indexes = []
with open('/content/drive/My Drive/Kaggle/Twits/Data/drop_rows.txt', 'r') as f:
    for _ in range(10390):
        drop_indexes.append(float(f.readline().strip("\n")))

In [0]:
df = df.drop(drop_indexes)
df.reset_index(drop = True,inplace = True)


## The dataset


In [0]:
y = df['score']
df.drop(columns='score', inplace=True)

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.05, random_state=SEED)

#To be sure we don't use indices to predict something
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

print("Train shape: {}".format(X_train.shape))
print("Test shape: {}".format(X_test.shape))

X_train['score'] = y_train
X_test['score'] = y_test

## Prepare the Data with the BERT Model Time

### Data processor

In [0]:
class DataProcessor():
    def __init__(self,train,test,max_seq_length):
        self.train_df = train
        self.test_df = test
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.max_seq_length = max_seq_length
        
    def _tokenization(self,name):
        if name == 'train':
            df_data = self.train_df
            
        elif name == 'test':
            df_data = self.test_df
        texts = df_data.text
        all_tokens = []
       
        for text in texts:
            text = self.tokenizer.tokenize(text)
            text = text[:self.max_seq_length - 2]
            input_sequence = ['[CLS]'] + text + ['[SEP]']
            pad_len = self.max_seq_length - len(input_sequence)
            
            tokens = self.tokenizer.convert_tokens_to_ids(input_sequence)
            tokens += [0] * pad_len
            
            all_tokens.append(tokens)
           
        return np.array(all_tokens)
    def get_train_test_features(self):
        train_features = self._tokenization('train')
        test_features = self._tokenization('test')
        return train_features,test_features

Because i don't have quite powerfull machine to train the NN i decided to bootstrap the part of the dataset for train

In [0]:
def get_bootstrap_samples(data, n_samples):
    np.random.seed(42)
    indices = np.random.randint(0, len(data), n_samples)
    samples = data.loc[indices]
    return samples

n_samples = 1000000
X_train_bs = get_bootstrap_samples(X_train, n_samples).reset_index(drop = True)

In [0]:
print("Positive scores: ",np.sum(X_train_bs['score']>0))
print("Negative scores: ",np.sum(X_train_bs['score']<0))

In [0]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
model = model_class.from_pretrained(pretrained_weights) 

In [0]:
def get_emb(X_train_bs,X_test,feature,max_seq_length,model):
  
  X_train = X_train_bs[[feature,'score']].copy()
  X_test = X_test[[feature,'score']].copy() # for now i dont need the test
  
  if feature == 'parent_text':
    X_train = X_train.rename(columns = {'parent_text':'text'})
    X_test = X_test.rename(columns = {'parent_text':'text'})


  data_process = DataProcessor(X_train,X_test,max_seq_length)
  train_features,test_features = data_process.get_train_test_features()

  input_ids = torch.tensor([f for f in train_features])

  with torch.no_grad():
    last_hidden_states = model(input_ids)
    
  out = pd.DataFrame(last_hidden_states[0][:,0,:].numpy())
  out['Score'] = np.array(X_train['score'])
  return out




In [0]:
step = 1000
max_seq_length = 64
slices = np.arange(0,len(X_train_bs)+step,step)
features = pd.DataFrame()
feature = 'parent_text'
for i in range (600,len(slices)-1,1):
  print(slices[i],slices[i+1])
  if i == 0 :
    data = X_train_bs.loc[slices[i]:slices[i+1]].reset_index(drop = True)
  else: 
    data = X_train_bs.loc[slices[i]+1:slices[i+1]].reset_index(drop = True)
  
  out = get_emb(data,X_test.loc[:1],feature,max_seq_length,model)
  features = features.append(out).reset_index(drop = True)
  if slices[i+1] % 50000 == 0:
    features.to_csv("/content/drive/My Drive/Kaggle/Twits/Data/parent_text_" + str(slices[i+1]) + ".csv")
    features = pd.DataFrame()


In [0]:
step = 1000
max_seq_length = 64
slices = np.arange(0,len(X_train_bs)+step,step)
features = pd.DataFrame()
feature = 'text'
for i in range (50,len(slices)-1):
  print(slices[i],slices[i+1])
  if i == 0 :
    data = X_train_bs.loc[slices[i]:slices[i+1]].reset_index(drop = True)
  else: 
    data = X_train_bs.loc[slices[i]+1:slices[i+1]].reset_index(drop = True)
  
  out = get_emb(data,X_test.loc[:1],feature,max_seq_length,model)
  features = features.append(out).reset_index(drop = True)
  if slices[i+1] % 50000 == 0:
    features.to_csv("/content/drive/My Drive/Kaggle/Twits/Data/text_" + str(slices[i+1]) + ".csv")
    features = pd.DataFrame()


In [0]:
fr = 50000
to = 300000
step = 50000
parent_text = ["parent_text_" + str(i) + ".csv" for i in range(fr,to+1,step)]
text = ["text_" + str(i) + ".csv"  for i in range(fr,to+1,step)]

In [0]:
parent_data = pd.DataFrame()
for i in range(len(parent_text)):
  if i == 0:
    parent_data = pd.read_csv("/content/drive/My Drive/Kaggle/Twits/Data/" + parent_text[i])
  else:
    parent_data = parent_data.append(pd.read_csv("/content/drive/My Drive/Kaggle/Twits/Data/" + parent_text[i])).reset_index(drop = True)
parent_data.reset_index(drop = True,inplace = True)
parent_data.drop(columns = 'Unnamed: 0',inplace = True)

In [0]:
for i in range(len(text)):
  if i == 0:
    text_data = pd.read_csv("/content/drive/My Drive/Kaggle/Twits/Data/" + text[i])
  else:
    text_data = text_data.append(pd.read_csv("/content/drive/My Drive/Kaggle/Twits/Data/" + text[i]), error_bad_lines=False).reset_index(drop = True)
 
text_data.reset_index(drop = True,inplace = True)
text_data.drop(columns = 'Unnamed: 0',inplace = True)

In [0]:
print(sum(parent_data['Score'] == text_data['Score']))

300001


In [0]:
parent_data = parent_data.add_suffix('_parent')
text_data = text_data.add_suffix('_text')
data = pd.concat([parent_data, text_data], axis=1)

In [0]:
del parent_data,text_data

In [0]:
data.to_csv('/content/drive/My Drive/Kaggle/Twits/Data/prep_data2.csv')

In [0]:
del data

In [0]:
step = 1000
max_seq_length = 64
slices = np.arange(0,len(X_test)+step,step)
features = pd.DataFrame()
feature = 'parent_text'
for i in range (0,len(slices)-1,1):
  print(slices[i],slices[i+1])
  if i == 0 :
    data = X_test.loc[slices[i]:slices[i+1]].reset_index(drop = True)
  else: 
    data = X_test.loc[slices[i]+1:slices[i+1]].reset_index(drop = True)
  
  out = get_emb(data,data.loc[:1],feature,max_seq_length,model)
  features = features.append(out).reset_index(drop = True)
  if slices[i+1] % 50000 == 0 or slices[i+1]>199000:
    features.to_csv("/content/drive/My Drive/Kaggle/Twits/Data/test_parent_text_" + str(slices[i+1]) + ".csv")
    features = pd.DataFrame()

In [0]:
step = 1000
max_seq_length = 64
slices = np.arange(0,len(X_test)+step,step)
features = pd.DataFrame()
feature = 'text'
for i in range (0,len(slices)-1,1):
  print(slices[i],slices[i+1])
  if i == 0 :
    data = X_test.loc[slices[i]:slices[i+1]].reset_index(drop = True)
  else: 
    data = X_test.loc[slices[i]+1:slices[i+1]].reset_index(drop = True)
  
  out = get_emb(data,data.loc[:1],feature,max_seq_length,model)
  features = features.append(out).reset_index(drop = True)
  if slices[i+1] % 50000 == 0 or slices[i+1]>199000 :
    features.to_csv("/content/drive/My Drive/Kaggle/Twits/Data/test_text_" + str(slices[i+1]) + ".csv")
    features = pd.DataFrame()

In [0]:
fr = 50000
to = 200000
step = 50000
parent_text = ["test_parent_text_" + str(i) + ".csv" for i in range(fr,to+1,step)]
text = ["test_text_" + str(i) + ".csv"  for i in range(fr,to+1,step)]

In [0]:
parent_data = pd.DataFrame()
for i in range(len(parent_text)):
  if i == 0:
    parent_data = pd.read_csv("/content/drive/My Drive/Kaggle/Twits/Data/" + parent_text[i])
  else:
    parent_data = parent_data.append(pd.read_csv("/content/drive/My Drive/Kaggle/Twits/Data/" + parent_text[i])).reset_index(drop = True)
parent_data.reset_index(drop = True,inplace = True)
parent_data.drop(columns = 'Unnamed: 0',inplace = True)

text_data = pd.DataFrame()
for i in range(len(text)):
  if i == 0:
    text_data = pd.read_csv("/content/drive/My Drive/Kaggle/Twits/Data/" + text[i])
  else:
    text_data = text_data.append(pd.read_csv("/content/drive/My Drive/Kaggle/Twits/Data/" + text[i])).reset_index(drop = True)
 
text_data.reset_index(drop = True,inplace = True)
text_data.drop(columns = 'Unnamed: 0',inplace = True)

In [0]:
print(sum(parent_data['Score'] == text_data['Score']))

199481


In [0]:
parent_data = parent_data.add_suffix('_parent')
text_data = text_data.add_suffix('_text')
data = pd.concat([parent_data, text_data], axis=1)

In [0]:
del parent_data,text_data

In [0]:
data.to_csv('/content/drive/My Drive/Kaggle/Twits/Data/prep_test_data2.csv')

In [0]:
del data

##Model

In [0]:
df_train = pd.read_csv('/content/drive/My Drive/Kaggle/Twits/Data/prep_data.csv',engine = None).iloc[:300000]
df_test = pd.read_csv('/content/drive/My Drive/Kaggle/Twits/Data/prep_test_data.csv',engine = None)

df_train.drop(columns = "Unnamed: 0",inplace = True)
df_train.drop(columns = 'Score_text',inplace = True)

df_test.drop(columns = "Unnamed: 0",inplace = True)
df_test.drop(columns = 'Score_text',inplace = True)


In [0]:
df_train = df_train.rename(columns = {'Score_parent':"Score"})
df_test = df_test.rename(columns = {'Score_parent':"Score"})

y = df_train['Score']
df_train.drop(columns = 'Score',inplace = True)

y_test = df_test["Score"]
df_test.drop(columns = 'Score',inplace = True)

In [0]:

def model_lgb(df_train, y,df_test, random_state = 123, n_splits = 5):
    
    
    params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'l2'},
            'num_leaves': 31,
            'learning_rate': 0.1,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'n_estimators': 100,
            'verbose':-1}      
    #  Init predictions array with all zeroes
    predictions = np.zeros(len(df_test))
    
    
    mse_score = 0

    print('Starting cross validation')

    kf = KFold(n_splits=n_splits, shuffle = True, random_state = random_state)

    for train_index, test_index in kf.split(df_train):

        lgb_train = lgb.Dataset(df_train.iloc[train_index], y[train_index])
        lgb_eval = lgb.Dataset(df_train.iloc[test_index], y[test_index], reference=lgb_train)
        
        gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round = 1000,
                early_stopping_rounds = 20,
                verbose_eval=400,

                )
        print('Starting predicting...')
        predictions += gbm.predict(df_test, num_iteration=gbm.best_iteration)
       
    
        
        
        del lgb_train, lgb_eval,gbm
    
    #  Avarage probabilities and auc score
    predictions = predictions / n_splits
    

    df_test['Predict_score'] = predictions
  
    return df_test['Predict_score'], mse_score



In [0]:
predcitions,avr_score = model_lgb(df_train,y, df_test, random_state = 123, n_splits = 5)

Starting cross validation




Training until validation scores don't improve for 20 rounds.
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 42799.3
Starting predicting...
Training until validation scores don't improve for 20 rounds.
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 42462.8
Starting predicting...
Training until validation scores don't improve for 20 rounds.
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 41652.5
Starting predicting...
Training until validation scores don't improve for 20 rounds.
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 41235.7
Starting predicting...
Training until validation scores don't improve for 20 rounds.
Did not meet early stopping. Best iteration is:
[99]	valid_0's l2: 40587.6
Starting predicting...


In [0]:
mean_squared_error(y_test, predcitions)

41193.56030789585

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor

In [0]:
params = {'iterations': 1000,
          'learning_rate':0.1,
          'depth':8,
          'loss_function':'RMSE',
          'metric_period' : 50,
          'l2_leaf_reg': 0.1,
          'subsample':0.8,
          'early_stopping_rounds':10
          }
model = CatBoostRegressor(**params)


In [0]:
model.fit(df_train,y)

0:	learn: 211.0478068	total: 2.24s	remaining: 37m 14s
50:	learn: 202.3939138	total: 1m 37s	remaining: 30m 13s
100:	learn: 198.7838276	total: 2m 59s	remaining: 26m 37s
150:	learn: 195.3626430	total: 4m 12s	remaining: 23m 42s
200:	learn: 192.1775267	total: 5m 25s	remaining: 21m 32s
250:	learn: 189.4240827	total: 6m 36s	remaining: 19m 44s
300:	learn: 186.9011422	total: 7m 49s	remaining: 18m 9s
350:	learn: 184.4694557	total: 9m 2s	remaining: 16m 42s
400:	learn: 182.2155340	total: 10m 12s	remaining: 15m 14s
450:	learn: 180.0673357	total: 11m 23s	remaining: 13m 52s
500:	learn: 178.0081618	total: 12m 34s	remaining: 12m 31s
550:	learn: 176.0371018	total: 13m 46s	remaining: 11m 13s
600:	learn: 174.1727311	total: 14m 58s	remaining: 9m 56s
650:	learn: 172.3998829	total: 16m 9s	remaining: 8m 39s
700:	learn: 170.6819273	total: 17m 22s	remaining: 7m 24s
750:	learn: 168.9812965	total: 18m 35s	remaining: 6m 9s
800:	learn: 167.3448479	total: 19m 47s	remaining: 4m 54s
850:	learn: 165.7451838	total: 20m 

<catboost.core.CatBoostRegressor at 0x7f22460cea58>

In [0]:
model.save_model('/content/drive/My Drive/Kaggle/Twits/catboost_model.cbm')

In [0]:
predictions = model.predict(df_test)

In [0]:
mean_squared_error(y_test, predictions)

41775.52772067411