In [4]:
#!pip install nb_black
#%load_ext nb_black

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

spark = SparkSession.builder.getOrCreate()

In [5]:
spark


In [2]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql import functions as F
import time
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import (
    Tokenizer,
    RegexTokenizer,
    StopWordsRemover,
    CountVectorizer,
    IDF,
    StringIndexer,
    StandardScaler,
)

from pyspark.ml.classification import LogisticRegression, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator





# LOAD
df = spark.read.load("gs://colmbo_bdpp_bucket1/datasetB1.csv", format="csv", sep="|", header="true")


#df.printSchema()

#df.show(2)
#df.count()
df = df.select("text", "label")


# TRAIN/TEST SPLIT

train, test = df.randomSplit([0.7, 0.3], seed=8984)

#print(f"Train set length: {train.count()} records")
#print(f"Test set length: {test.count()} records")

#train.first()


# CLEAN

# drop null values
train.dropna()
test.dropna()

#print(f"Train set length: {train.count()} records")


# drop numbers
train.withColumn("text", F.regexp_replace(F.col("text"), "\d+", ""))

#train.withColumn("text", F.regexp_replace(F.col("text"), "\!+", ""))
train.select('text').replace(".", "")

#train.head(2)

start = time.time()

# PIPELINE


regexTokenizer = RegexTokenizer(
    inputCol="text", outputCol="words", pattern="\\W", minTokenLength=2
)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
StopWordsRemover.loadDefaultStopWords("english")

cv = CountVectorizer(
    inputCol="filtered", outputCol="counts", minDF=2.0 , vocabSize=1500)

idf = IDF(
    inputCol="counts", outputCol="features", minDocFreq=10
)  # minDocFreq: remove sparse terms

label_stringIdx = StringIndexer(inputCol="label", outputCol="index")
label_stringIdx.setHandleInvalid("skip")

scaler = StandardScaler(
    inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True
)


classifier = LinearSVC(featuresCol="features", labelCol="index", maxIter=10, regParam=0.1)

pipeline = Pipeline(
    stages=[regexTokenizer, remover, cv, idf, label_stringIdx,classifier]
)  

pipelineFit = pipeline.fit(train)


train_time = time.time()-start
print("TRAIN TIME : ",train_time, "s")


TRAIN TIME :  50.52408504486084 s


In [3]:
from pyspark.sql.functions import countDistinct


start = time.time()
predictions = pipelineFit.transform(df)
test_time = time.time()-start
print("TEST TIME : ",test_time, "s")

#predictions.printSchema()

# predictions.select(["features", "index", "prediction"]).show()
#predictions.select(["features", "index", "filtered"]).show()


#predictions.select(F.countDistinct("index")).show()

predictions.groupBy("label").count().show()
# .orderBy()
# print(l)

# predictions.select("counts")

TEST TIME :  0.22912049293518066 s
+-----+-----+
|label|count|
+-----+-----+
|    0|25001|
|    1|25000|
+-----+-----+

