In [None]:
# Mount Google Drive so the dataset CSVs stored there are readable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-04-17 15:14:52--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.26
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.26|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-04-17 15:14:52--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1594 (1.6K) [text/plain]
Saving to: ‘STDOUT’


2021-04-17 15:14:53 (34.0 MB/s) - written to stdout [1594/1594]

setup Colab for PySpark 3.0.2 and Spark NLP 3.0.1
[K     |████████████████████████████████|

In [None]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import StringType,StructType,StructField,IntegerType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import json
class BuildModel:
  """K-fold training/evaluation harness: sentence embeddings + logistic regression.

  Constructing an instance starts a Spark NLP session, loads the fold-annotated
  CSV at `path`, converts each train/validation split to a Spark DataFrame and
  builds the NLP and ML pipelines. Call `process()` to run every fold and get
  the per-fold metrics, then `save()` to retrieve the last fitted model.
  """

  # Defaults live on the class so a bare instance still has sane values.
  n_folds = 5
  model = None

  def __init__(self,path,n_folds=5):
    """Set up Spark, data folds, pipeline stages and the evaluator.

    Args:
      path: CSV file containing a `kfold` column with fold indices.
      n_folds: number of folds (indices 0..n_folds-1) to build; defaults to 5.
    """
    self.n_folds = n_folds
    self.model = None
    self.spark = sparknlp.start()
    self.prepare_data(path)
    self.load_pipe_components()
    self.create_fin_pipeline()
    self.evaluation()

  def process(self):
    """Train and evaluate on every fold; return a list of metric dicts (one per fold)."""
    results = []
    for i, (df_tr, df_val) in enumerate(zip(self.fin_tfolds, self.fin_vfolds)):
      print('ROUND '+str(i)+' started')
      results.append(self.run_training(df_tr, df_val, i))
      print('ROUND '+str(i)+' completed')
    return results

  @staticmethod
  def _normalise_columns(df):
    """Drop the textual class column and rename the numeric one to `target`."""
    # `columns=` is required: a bare mapping passed to DataFrame.rename is
    # applied to the index labels, which silently leaves the column unrenamed.
    if 'sarcasm' in df.columns:
      return df.drop(columns='sarcasm').rename(columns={'sar_num':'target'})
    if 'final_sent_class' in df.columns:
      return df.drop(columns='final_sent_class').rename(columns={'sentiment':'target'})
    return df

  def prepare_data(self,path):
    """Read the CSV and build per-fold Spark train/validation DataFrames.

    Populates `self.fin_tfolds` and `self.fin_vfolds`, each a list of
    `self.n_folds` Spark DataFrames with columns id, text, label.
    """
    df = self._normalise_columns(pd.read_csv(path))

    # NOTE(review): the schema appears to be applied by position, so after the
    # drops the CSV columns must be ordered id, text, label - confirm.
    schema = StructType([StructField('id', StringType(), True),
                         StructField('text', StringType(), True),
                         StructField('label', IntegerType(), True)])

    self.fin_tfolds = []
    self.fin_vfolds = []
    for fold in range(self.n_folds):
      in_fold = df.kfold == fold
      train_pdf = df[~in_fold].drop(columns='kfold').reset_index(drop=True)
      val_pdf = df[in_fold].drop(columns='kfold').reset_index(drop=True)
      self.fin_tfolds.append(self.spark.createDataFrame(train_pdf,schema))
      self.fin_vfolds.append(self.spark.createDataFrame(val_pdf,schema))

  def load_pipe_components(self):
    """Instantiate the Spark NLP annotators and the logistic-regression stage."""
    self.document = DocumentAssembler().setInputCol("text").setOutputCol("document")
    self.sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
    self.use = UniversalSentenceEncoder.pretrained().setInputCols(["sentence"]).setOutputCol("embeddings")
    self.embeddings_finisher = (EmbeddingsFinisher()
                                .setInputCols("embeddings")
                                .setOutputCols("embeddings_vectors")
                                .setCleanAnnotations(True)
                                .setOutputAsVector(True))
    # 'input' is the exploded embedding column produced in run_training.
    self.ml = LogisticRegression(featuresCol='input',labelCol='label')

  def create_fin_pipeline(self):
    """Assemble the embedding pipeline and the classifier pipeline."""
    self.nlp_pipeline = Pipeline(stages = [
          self.document,
          self.sentenceDetector,
          self.use,
          self.embeddings_finisher,
          ])
    self.ml_pipeline = Pipeline(stages = [self.ml])

  def run_training(self,df_tr,df_val,i):
    """Fit on one training fold and score the matching validation fold.

    Args:
      df_tr: Spark DataFrame (id, text, label) used for fitting.
      df_val: Spark DataFrame (id, text, label) used for evaluation.
      i: fold index; kept for interface compatibility, not used here.

    Returns:
      dict with keys 'accuracy', 'f1', 'weighted precision', 'weighted recall'.
    """
    # Fit the NLP pipeline once on the training fold and reuse the fitted model
    # for validation instead of re-fitting it on the validation data.
    nlp_model = self.nlp_pipeline.fit(df_tr)

    # One row per sentence embedding; every row keeps its document's label.
    df_tr_emb = nlp_model.transform(df_tr).select(
        'id',F.explode('embeddings_vectors').alias('input'),'label')
    self.model = self.ml_pipeline.fit(df_tr_emb)

    # Carry the label through the explode rather than joining back on 'id',
    # which is costly and duplicates rows if ids are not unique.
    df_val_emb = nlp_model.transform(df_val).select(
        'id',F.explode('embeddings_vectors').alias('input'),'label')

    # Four metrics are computed from the same predictions; cache them so the
    # embedding + prediction work is not repeated for each metric.
    df_eval = self.model.transform(df_val_emb).select('label','prediction').cache()
    try:
      acc = self.evaluator.evaluate(df_eval, {self.evaluator.metricName: "accuracy"})
      f1 = self.evaluator.evaluate(df_eval, {self.evaluator.metricName: "f1"})
      weightedPrecision = self.evaluator.evaluate(df_eval, {self.evaluator.metricName: "weightedPrecision"})
      weightedRecall = self.evaluator.evaluate(df_eval, {self.evaluator.metricName: "weightedRecall"})
    finally:
      df_eval.unpersist()

    return {'accuracy': acc,'f1':f1,'weighted precision': weightedPrecision,'weighted recall':weightedRecall}

  def evaluation(self):
    """Create the multiclass evaluator used by run_training."""
    self.evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

  def save(self):
    """Return the most recently fitted classifier model.

    Nothing is written to disk. The model is overwritten on every
    run_training call, so after process() this is the last fold's model;
    it is None if no training has run yet.
    """
    return self.model

In [None]:
# Build the cross-validation harness on the fold-annotated sarcasm training set.
sarcasm_csv = '/content/drive/MyDrive/sarcasm data/train_sarc_final_with_folds.csv'
bm = BuildModel(sarcasm_csv)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
# Load the fold-annotated training CSV into pandas.
csv_path = '/content/drive/MyDrive/sarcasm data/train_sarc_final_with_folds.csv'
df_pan = pd.read_csv(csv_path)

In [None]:
# Hold out fold 0 for validation; every other row goes to training.
is_fold0 = df_pan['kfold'] == 0
x_val = df_pan[is_fold0]
x_tr = df_pan[~is_fold0]

In [None]:
# Remove the textual class and fold-index columns from both splits.
# errors='ignore' keeps this cell safe to re-run: x_tr / x_val are reassigned
# in place, so a second execution would otherwise raise KeyError.
x_tr = x_tr.drop(columns=['sarcasm','kfold'], errors='ignore')
x_val = x_val.drop(columns=['sarcasm','kfold'], errors='ignore')

In [None]:

from sklearn.model_selection import train_test_split
import pyspark.sql.functions as F
from pyspark.sql.types import StringType,StructType,StructField,IntegerType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import VectorUDT, DenseVector
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd
import time
prog_start_time = time.time()
import json

# Train one logistic-regression model on the whole dataset (all folds):
# text -> sentences -> Universal Sentence Encoder embeddings -> classifier.
# Held-out evaluation is done separately through BuildModel.process().

# NOTE(review): if a Spark session is already running (BuildModel starts one
# without gpu=True), this call presumably reuses it - confirm GPU is in effect.
spark = sparknlp.start(gpu=True)
schema_sarc = StructType([StructField('id', StringType(), True),
                          StructField('text', StringType(), True),
                          StructField('label', IntegerType(), True)])

# Derive the training frame from df_pan (loaded in an earlier cell) so this
# cell does not depend on a `df` that is only defined further down.
df = df_pan.drop(columns=['sarcasm','kfold'])
df_sp = spark.createDataFrame(df,schema_sarc)

document = DocumentAssembler().setInputCol("text").setOutputCol("document")
sentenceDetector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
use = UniversalSentenceEncoder.pretrained().setInputCols(["sentence"]).setOutputCol("embeddings")
embeddings_finisher = (EmbeddingsFinisher()
                       .setInputCols("embeddings")
                       .setOutputCols("embeddings_vectors")
                       .setCleanAnnotations(True)
                       .setOutputAsVector(True))

# Defined here so the cell runs on a fresh kernel: ml_pipeline below needs it.
ml = LogisticRegression(featuresCol='input',labelCol='label')

lemm_pipeline = Pipeline(
    stages = [
        document,
        sentenceDetector,
        use,
        embeddings_finisher,
    ])

ml_pipeline = Pipeline(stages = [ml])

# One row per sentence embedding, each carrying its document's id and label.
df_emb = lemm_pipeline.fit(df_sp).transform(df_sp)
df_emb = df_emb.select('id',F.explode('embeddings_vectors').alias('input'),'label')

model = ml_pipeline.fit(df_emb)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
# Run every cross-validation round; x is the list of per-fold metric dicts
# returned by BuildModel.process().
x = bm.process()

ROUND 0 started
ROUND 0 completed
ROUND 1 started
ROUND 1 completed
ROUND 2 started
ROUND 2 completed
ROUND 3 started
ROUND 3 completed
ROUND 4 started
ROUND 4 completed


In [None]:
# bm.save() just returns bm.model (nothing is written to disk); run_training
# reassigns that attribute each round, so this is the model from the last round.
mod = bm.save()

In [None]:
# Reload the fold-annotated training CSV into `df`.
folds_csv = '/content/drive/MyDrive/sarcasm data/train_sarc_final_with_folds.csv'
df = pd.read_csv(folds_csv)


In [None]:
# Spark schema for the sarcasm data: nullable string id, string text, integer label.
schema_sarc = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('id', StringType()),
        ('text', StringType()),
        ('label', IntegerType()),
    )
])

In [None]:
# Drop the textual class and fold-index columns. errors='ignore' makes the
# cell idempotent: `df` is reassigned in place, so re-running it would
# otherwise raise KeyError because the columns are already gone.
df = df.drop(columns=['sarcasm','kfold'], errors='ignore')

In [None]:
# Show which columns remain in df.
df.columns

Index(['u_id', 'preprocessed', 'sar_num'], dtype='object')

In [None]:
# Display the model fitted by ml_pipeline on the full dataset.
model

PipelineModel_8e899af0a6c8

In [None]:
# Display the per-fold metrics collected from bm.process().
x

[{'accuracy': 0.9837244897959184,
  'f1': 0.9837241491575145,
  'weighted precision': 0.9837649887990588,
  'weighted recall': 0.9837244897959183},
 {'accuracy': 0.9834013605442177,
  'f1': 0.9834010157210402,
  'weighted precision': 0.9834415321215734,
  'weighted recall': 0.9834013605442177},
 {'accuracy': 0.9838775510204082,
  'f1': 0.9838772548883551,
  'weighted precision': 0.9839131038198725,
  'weighted recall': 0.9838775510204081},
 {'accuracy': 0.9836734693877551,
  'f1': 0.9836730443837043,
  'weighted precision': 0.9837238363011559,
  'weighted recall': 0.9836734693877551},
 {'accuracy': 0.9838945578231293,
  'f1': 0.9838942207441654,
  'weighted precision': 0.9839350710649235,
  'weighted recall': 0.9838945578231293}]