## News Classification using simple logistic regression approach

#### Tools used :
    - Pyspark ( for text processing , model building and data gathering)
    - Pywebio ( for building simple web applicatin for news classifier)
    
    

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.classification import LogisticRegression,DecisionTreeClassifier,RandomForestClassifier,NaiveBayes,DecisionTreeModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import CountVectorizer,Tokenizer,StopWordsRemover,VectorAssembler,StringIndexer,HashingTF,IDF,Word2Vec
from pyspark.ml.pipeline import Pipeline
from pyspark.sql.types import *
import numpy as np
from pywebio import input,output
import time
from pywebio import start_server

In [3]:
spark = SparkSession.builder.appName("Sentiment 2").getOrCreate()
spark

## 1) Read Data from txt file

In [4]:
# schema = StructType([
#     StructField("text",StringType(),True),
#     StructField("class",StringType(),True),
# ])
# df = spark.read.options(delimiter=";").csv("train.txt",inferSchema=True,schema=schema)
# df.show()


df = spark.read.csv("train.csv",inferSchema=True,header=True)

In [5]:
df = df.withColumnRenamed("Class Index","class").withColumnRenamed("Title","text")

In [6]:
df = df.drop("Description")
df.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|    3|Wall St. Bears Cl...|
|    3|Carlyle Looks Tow...|
|    3|Oil and Economy C...|
|    3|Iraq Halts Oil Ex...|
|    3|Oil prices soar t...|
|    3|Stocks End Up, Bu...|
|    3|Money Funds Fell ...|
|    3|Fed minutes show ...|
|    3|Safety Net (Forbe...|
|    3|Wall St. Bears Cl...|
|    3|Oil and Economy C...|
|    3|No Need for OPEC ...|
|    3|Non-OPEC Nations ...|
|    3|Google IPO Auctio...|
|    3|Dollar Falls Broa...|
|    3|Rescuing an Old S...|
|    3|Kids Rule for Bac...|
|    3|In a Down Market,...|
|    3|US trade deficit ...|
|    3|Shell 'could be t...|
+-----+--------------------+
only showing top 20 rows



##### Dataset is unbalanced so apply undersampling for sack of simplicity

In [7]:
data = df.collect()

In [8]:
## shuffle data is necessary otherwise we got bad accuracy while testing and predicting new data
np.random.shuffle(data)

In [9]:
df = spark.createDataFrame(data)

In [10]:
df.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|    2|HOLMES CONSIDERS ...|
|    3|No Respite for Mi...|
|    4|&lt;em&gt;Reg&lt;...|
|    2|Astros #39; Jeff ...|
|    4|Borland titanic e...|
|    2|Dent, Okun win as...|
|    1|UN Official Warns...|
|    3|Telstra creates n...|
|    2|NBA-Raptors Produ...|
|    2|  Wade Impresses All|
|    1|Confusion Surroun...|
|    3|The Investment Co...|
|    1|French girls expe...|
|    4|Blocking out the bad|
|    3|NWA #39;s Anderso...|
|    4|Microsoft Sues Po...|
|    1|Cheney Is a Quiet...|
|    3|Dollar, markets d...|
|    1|Film star politic...|
|    4|The Post-Merger P...|
+-----+--------------------+
only showing top 20 rows



## 2) Data preparation
    - Tokenization
    - CountVectorizer/TF-IDF transformation

In [11]:
pipe = Pipeline(stages=[
    Tokenizer(inputCol="text",outputCol="text_tokens"),
#     HashingTF(inputCol="text_tokens",outputCol="text_tf"),
#     IDF(inputCol="text_tf",outputCol="text_vectors")
    CountVectorizer(inputCol="text_tokens",outputCol="text_vectors")
])

pipe_fitted = pipe.fit(df)

In [12]:
get_token_count = F.udf(lambda x: len(x),IntegerType())


def prepareData(data):
    fdata = pipe_fitted.transform(data)
    fdata = fdata.withColumn("token counts",get_token_count("text_tokens"))
    vc = VectorAssembler(inputCols=["text_vectors"],outputCol="features")
    fdata = vc.transform(fdata)
    return fdata

In [13]:
fdata = prepareData(df)
fdata.show()

+-----+--------------------+--------------------+--------------------+------------+--------------------+
|class|                text|         text_tokens|        text_vectors|token counts|            features|
+-----+--------------------+--------------------+--------------------+------------+--------------------+
|    2|HOLMES CONSIDERS ...|[holmes, consider...|(55563,[3,760,104...|           5|(55563,[3,760,104...|
|    3|No Respite for Mi...|[no, respite, for...|(55563,[2,24,53,8...|           4|(55563,[2,24,53,8...|
|    4|&lt;em&gt;Reg&lt;...|[&lt;em&gt;reg&lt...|(55563,[0,44,88,2...|           7|(55563,[0,44,88,2...|
|    2|Astros #39; Jeff ...|[astros, #39;, je...|(55563,[6,11,16,4...|          10|(55563,[6,11,16,4...|
|    4|Borland titanic e...|[borland, titanic...|(55563,[0,11,141,...|           9|(55563,[0,11,141,...|
|    2|Dent, Okun win as...|[dent,, okun, win...|(55563,[15,34,39,...|           8|(55563,[15,34,39,...|
|    1|UN Official Warns...|[un, official, wa...|(55563

In [14]:

inverse_class_labels = {1:"World" , 2:"Sports",3:"Business",4:"Sci/Tech"}

get_class_labels = F.udf(lambda x: x , IntegerType())
fdata = fdata.withColumn("target",get_class_labels("class"))

In [15]:
final_data = fdata.select(["features","target"])
train_data , test_data = final_data.randomSplit([0.75,0.25])


## 3) Model building

### Logistic model

In [16]:
lrmd = LogisticRegression(featuresCol="features",labelCol="target",maxIter=150)
lrmd_fitted = lrmd.fit(train_data)

In [17]:
lrmd_fitted.evaluate(train_data).accuracy

0.9928902079780283

In [18]:
lrmd_fitted.evaluate(test_data).accuracy

0.7812268534105448

### decision tree

In [25]:
dcmd = DecisionTreeClassifier(featuresCol="features",labelCol="target",maxDepth=3,maxBins=18,minInfoGain=0.2,impurity="gini")
dcmd_fitted = dcmd.fit(train_data)

In [26]:
train_results = dcmd_fitted.transform(train_data)
test_results = dcmd_fitted.transform(test_data)

In [27]:
test_results.filter(test_results.target==test_results.prediction).count()/test_results.count()

0.24901013354808402

In [78]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="target")

In [79]:
print("train results : ",1 - evaluator.evaluate(train_results))
print("test results : ",1 - evaluator.evaluate(test_results))

train results :  0.89933495215833
test results :  0.9019825647035269


In [28]:
nv = NaiveBayes(featuresCol="features",labelCol="target")
nvmd = nv.fit(train_data)

In [29]:
test_preds = nvmd.transform(test_data)

In [30]:
test_preds.filter(test_preds.target==test_preds.prediction).count()/test_preds.count()

0.04422521978390712

In [21]:
def predictForText(s):
    t = spark.createDataFrame([s],StringType()).withColumnRenamed("value","text")
    t = prepareData(t)
    return inverse_class_labels[lrmd_fitted.transform(t).select("prediction").collect()[0]["prediction"]]

In [22]:
predictForText("Money fraud in india")

'Business'

In [23]:
predictForText("5g smart phones in market")

'Sci/Tech'

## Make simple web application for ML model

In [24]:
def callApp():
    output.clear()
    output.put_markdown("# News Classification Using Pyspark")
    output.style(output.put_markdown("#### Note : Possible output are Sports , Sci-Tech , Business and World ") , "color:blue")
    output.put_text("Please enter news headline in below input box")
    inputText = input.input(label="Enter news headline",type="text")
    with output.put_loading():
        c1 = predictForText(inputText)
        output.style(output.put_text("Predicted Class is ",c1),"color:green;font-size:24px")

In [None]:
callApp()