In [111]:
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import numpy as np
import findspark
findspark.init()
from nltk.corpus import stopwords
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, pandas_udf,col, lower, regexp_replace
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType
from pyspark.ml.feature import CountVectorizer, StringIndexer, Tokenizer, StopWordsRemover
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from sklearn.metrics import confusion_matrix
from pyspark.ml import PipelineModel, Pipeline, Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.ml.util import DefaultParamsWritable, DefaultParamsReadable
nltk.download('stopwords')
nltk.download('punkt')

# Define English stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/kafka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/kafka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Create a SparkSession
spark = SparkSession \
        .builder \
        .appName("Text Classification with PySpark") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

24/08/31 11:44:50 WARN Utils: Your hostname, asm-hp-250 resolves to a loopback address: 127.0.0.2, but we couldn't find any external IP address!
24/08/31 11:44:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/31 11:44:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/31 11:45:01 WARN MacAddressUtil: Failed to find a usable hardware address from the network interfaces; using random bytes: b6:7d:8c:14:78:8a:58:4c


In [9]:
# Load the data
data = spark.read.csv('/home/kafka/Desktop/twitter_sentiment_analysis/twitter_training.csv', header=False, inferSchema=True)
validation = spark.read.csv('/home/kafka/Desktop/twitter_sentiment_analysis/twitter_validation.csv', header=False, inferSchema=True)


                                                                                

In [11]:
# Define column names
columns = ['id', 'Company', 'Label', 'Text']

# Rename columns
for i, col in enumerate(columns):
    data = data.withColumnRenamed('_c{}'.format(i), col)
    validation = validation.withColumnRenamed('_c{}'.format(i), col)

In [13]:
data.printSchema()

root
 |-- id: integer (nullable = true)
 |-- Company: string (nullable = true)
 |-- Label: string (nullable = true)
 |-- Text: string (nullable = true)



In [15]:
# Drop rows with empty 'Text' column
data = data.dropna(subset=['Text'])
validation = validation.dropna(subset=['Text'])

In [17]:
data.select("Text").show(10)

+--------------------+
|                Text|
+--------------------+
|im getting on bor...|
|I am coming to th...|
|im getting on bor...|
|im coming on bord...|
|im getting on bor...|
|im getting into b...|
|So I spent a few ...|
|So I spent a coup...|
|So I spent a few ...|
|So I spent a few ...|
+--------------------+
only showing top 10 rows



In [19]:
# Define the StringIndexer for the label column (index the labels)
label_indexer = StringIndexer(inputCol="Label", outputCol="Label2")
# # Define your index mapping
# class_index_mapping = { "Negative": 0, "Positive": 1, "Neutral": 2, "Irrelevant": 3 }

# Fit StringIndexer on data
label_indexer_model = label_indexer.fit(data)
data = label_indexer_model.transform(data)
validation = label_indexer_model.transform(validation)

# Extract label mapping
label_mapping = label_indexer_model.labels

# Print label mapping
print("Label Mapping:")
for index, label in enumerate(label_mapping):
    print(f"Index {index} --> Label '{label}'")

                                                                                

Label Mapping:
Index 0 --> Label 'Negative'
Index 1 --> Label 'Positive'
Index 2 --> Label 'Neutral'
Index 3 --> Label 'Irrelevant'


In [20]:
def clean_text(df, inputCol="Text", outputCol="cleaned_text"):
    # Remove links starting with https://, http://, www., or containing .com
    df = df.withColumn(outputCol, regexp_replace(df[inputCol], r'https?://\S+|www\.\S+|S+\.com\S+|youtu\.be/\S+', ''))
    # Remove words starting with # or @
    df = df.withColumn(outputCol, regexp_replace(df[outputCol], r'(@|#)\w+', ''))
    # Convert text to lowercase
    df = df.withColumn(outputCol, lower(df[outputCol]))
    # Remove non-alpha characters
    df = df.withColumn(outputCol, regexp_replace(df[outputCol], r'[^a-zA-Z\s]', ''))
    
    return df



In [23]:
cleaned_data = clean_text(data, inputCol="Text", outputCol="Text")
cleaned_validation = clean_text(validation, inputCol="Text", outputCol="Text")

In [25]:
# Define tokenizer
tokenizer = Tokenizer(inputCol="Text", outputCol="tokens")
# Define stopwords remover
stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens", stopWords=stop_words)

# Define CountVectorizer
count_vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="features", vocabSize=10000, minDF=5)

# Define Logistic Regression
lr = LogisticRegression(maxIter=10, labelCol="Label2", featuresCol="features")

In [27]:
# create the pipeline
pipeline = Pipeline(stages=[tokenizer, stopwords_remover, count_vectorizer, lr])

# Apply the pipeline to the data
model = pipeline.fit(cleaned_data)
processed_data = model.transform(cleaned_data)

24/08/31 11:47:24 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/08/31 11:47:24 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [28]:
processed_data.printSchema()

root
 |-- id: integer (nullable = true)
 |-- Company: string (nullable = true)
 |-- Label: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- Label2: double (nullable = false)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [34]:
processed_data.select("Text", "Label2", "prediction").show()

+--------------------+------+----------+
|                Text|Label2|prediction|
+--------------------+------+----------+
|im getting on bor...|   1.0|       1.0|
|i am coming to th...|   1.0|       1.0|
|im getting on bor...|   1.0|       1.0|
|im coming on bord...|   1.0|       1.0|
|im getting on bor...|   1.0|       1.0|
|im getting into b...|   1.0|       1.0|
|so i spent a few ...|   1.0|       1.0|
|so i spent a coup...|   1.0|       1.0|
|so i spent a few ...|   1.0|       1.0|
|so i spent a few ...|   1.0|       1.0|
| so i spent a few...|   1.0|       1.0|
|                 was|   1.0|       0.0|
|rockhard la varlo...|   2.0|       2.0|
|rockhard la varlo...|   2.0|       2.0|
|rockhard la varlo...|   2.0|       2.0|
|rockhard la vita ...|   2.0|       2.0|
|live rock  hard m...|   2.0|       2.0|
|ihard like me rar...|   2.0|       2.0|
|that was the firs...|   1.0|       1.0|
|this was the firs...|   1.0|       1.0|
+--------------------+------+----------+
only showing top

In [37]:
# Save the model
model.write().overwrite().save("logistic_regression_model.pkl")

                                                                                

In [41]:
from pyspark.ml import PipelineModel
from sklearn.feature_extraction.text import CountVectorizer
import sys
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the model
loaded_model = PipelineModel.load("/home/kafka/Desktop/twitter_sentiment_analysis/logistic_regression_model.pkl")


Traceback (most recent call last):
  File "/opt/spark/python/pyspark/serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "/opt/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 563, in dump
    return Pickler.dump(self, obj)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 653, in reducer_override
    return self._function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 526, in _function_reduce
    return self._dynamic_function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 507, in _dynamic_function_reduce
    state = _function_getstate(func)
            ^^^^^^^^^^^^^^^

PicklingError: Could not serialize object: IndexError: tuple index out of range

In [61]:
spark = SparkSession.builder \
    .appName("YourAppName") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()



In [69]:
# Extract 'Text' and 'Label2' columns without using lambda
# X_train = [row['Text'] for row in cleaned_data.select('Text').collect()]
# y_train = [row['Label2'] for row in cleaned_data.select('Label2').collect()]
X_train = [row['Text'] for row in cleaned_data.select('Text').limit(1000).collect()]
y_train = [row['Label2'] for row in cleaned_data.select('Label2').limit(1000).collect()]

X_test = [row['Text'] for row in cleaned_validation.select('Text').limit(1000).collect()]
y_test = [row['Label2'] for row in cleaned_validation.select('Label2').limit(1000).collect()]


In [75]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
matrix = CountVectorizer(analyzer='word', max_features=1000, ngram_range=(1, 3))

# Transform X_train and X_test from text to feature vectors
X_train = matrix.fit_transform(X_train).toarray()  # Fit and transform on the training data
X_test = matrix.transform(X_test).toarray()        # Transform the validation/test data using the same matrix

# Now you can check the shape if needed
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)



X_train shape: (1000, 1000)
X_test shape: (1000, 1000)


# intialization 

In [79]:
# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)


In [86]:
# Predict Class
y_pred = classifier.predict(X_test)

# Predict Class


In [89]:
cleaned_validation.show(10)

+-----+--------------------+----------+--------------------+------+
|   id|             Company|     Label|                Text|Label2|
+-----+--------------------+----------+--------------------+------+
| 3364|            Facebook|Irrelevant|i mentioned on fa...|   3.0|
|  352|              Amazon|   Neutral|bbc news  amazon ...|   2.0|
| 8312|           Microsoft|  Negative| why do i pay for...|   0.0|
| 4371|               CS-GO|  Negative|csgo matchmaking ...|   0.0|
| 4433|              Google|   Neutral|now the president...|   2.0|
| 6273|                FIFA|  Negative|hi  ive had madel...|   0.0|
| 7925|           MaddenNFL|  Positive|         thank you  |   1.0|
|11332|TomClancysRainbowSix|  Positive|rocket league sea...|   1.0|
| 1107|      AssassinsCreed|  Positive|my ass still knee...|   1.0|
| 2069|          CallOfDuty|  Negative|fix it jesus  ple...|   0.0|
+-----+--------------------+----------+--------------------+------+
only showing top 10 rows



In [91]:
 # Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.292

## A very low accuracy of 0.0.292 at first but when Mixed sentiment was removed, accuracy increased to 52.75%

In [100]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['neutral', 'negative', 'positive','irrelevant']))

              precision    recall  f1-score   support

     neutral       0.35      0.26      0.30       266
    negative       0.32      0.35      0.33       277
    positive       0.31      0.32      0.31       285
  irrelevant       0.17      0.22      0.19       172

    accuracy                           0.29      1000
   macro avg       0.29      0.28      0.28      1000
weighted avg       0.30      0.29      0.29      1000



# Base class for Model 3, 4 & 5

In [154]:
class Base:
    """Base class that houses common utilities for reading in test data
    and calculating model accuracy and F1 scores.
    """
    def __init__(self) -> None:
        pass

    def read_data(self, fname: str, lower_case: bool=False,
                  colnames=['truth', 'text']) -> pd.DataFrame:
        "Read in test data into a Pandas DataFrame"
        df = pd.read_csv(fname, sep='\t', header=None, names=colnames)
        df['truth'] = df['truth'].str.replace('__label__', '')
        # Categorical data type for truth labels
        df['truth'] = df['truth'].astype(int).astype('category')

        # Optional lowercase for test data (if model was trained on lowercased text)
        if lower_case:
            df['text'] = df['text'].str.lower()
        return df

    def accuracy(self, df: pd.DataFrame) -> None:
        "Prediction accuracy (percentage) and F1 score"

        acc = accuracy_score(df['sentiment'], df['pred'])*100
        f1 = f1_score(df['sentiment'], df['pred'], average='macro')*100

        recall = recall_score(df['sentiment'], df['pred'], average='macro')*100
        precision = precision_score(df['sentiment'], df['pred'], average='macro')*100
        newf1 = 2 * recall * precision / (recall + precision)

        df.to_csv("result.csv",index = False)

        print(len(df))
        print(
            "Accuracy: {:.3f}\nMacro F1-score: {:.3f}\nMacro recall: {:.3f}\nMacro precission: {:.3f}\nNew F1 MAcro: {:.3f}".format(
                acc, f1, recall, precision, newf1))
        print("{:.2f} & {:.2f} & {:.2f} & {:.2f}".format(acc, recall, precision, newf1))


# Model 3: Text blob sentiment

In [223]:
class TextBlobSentiment(Base):
    """Predict sentiment scores using TextBlob.
    https://textblob.readthedocs.io/en/dev/
    """
    def __init__(self, model_file: str=None) -> None:
        super().__init__()

    def score(self, text: str) -> float:
        # pip install textblob
        from textblob import TextBlob
        return TextBlob(text).sentiment.polarity

    def predict(self, df_train, df_test, lower_case: bool) -> pd.DataFrame:
        df_test['score'] = df_test['tweet'].apply(self.score)
        # Convert float score to category based on binning
        df_test['pred'] = pd.cut(df_test['score'],
                            bins=3,
                            labels=["negative", "neutral", "positive"])
        df = df_test.drop('score', axis=1)
        return df

# Initialize the model

In [226]:
text_blob = TextBlobSentiment()


In [230]:
import pandas as pd

# Ensure df_train and df_test are DataFrames
if isinstance(X_train, list):
    X_train = pd.DataFrame(X_train)
if isinstance(X_test, list):
    X_test = pd.DataFrame(X_test)

# Check if the required column exists
print(X_test.columns)  # Debugging step

# Replace 'tweet' with the correct column name, e.g., 'Text'
X_test['score'] = X_test['Text'].apply(self.score)

# Continue with your processing
result = text_blob.predict(X_train, y_test, True)


RangeIndex(start=0, stop=1, step=1)


KeyError: 'Text'

# Train the model

In [171]:


result = text_blob.predict(X_train, X_test, True)

TypeError: list indices must be integers or slices, not str

In [213]:
class SVMSentiment(Base):
    """Predict sentiment scores using a linear Support Vector Machine (SVM).
    Uses a sklearn pipeline.
    """
    def __init__(self, model_file: str=None) -> None:
        super().__init__()
        # pip install sklearn
        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        from sklearn.linear_model import SGDClassifier
        from sklearn.pipeline import Pipeline
        self.pipeline = Pipeline(
            [
                ('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(
                    loss='hinge',
                    penalty='l2',
                    alpha=1e-3,
                    random_state=42,
                    max_iter=100,
                    learning_rate='optimal',
                    tol=None,
                )),
            ]
        )

    def predict(self, df_train, df_test, lower_case: bool) -> pd.DataFrame:
        "Train model using sklearn pipeline"
        learner = self.pipeline.fit(df_train['Text'], df_train['sentiment'])

        # Fit the learner to the test data
        df_test['pred'] = learner.predict(df_test['Text'])
        return df_test

In [215]:
svm = SVMSentiment()

In [217]:
result = svm.predict(X_train, X_test, True)

KeyError: 'Text'

#  scalling values

In [145]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer with the correct 'analyzer' parameter
matrix = TfidfVectorizer(analyzer='word', max_features=1000, ngram_range=(1, 3))

# Transform X_train and X_test from text to feature vectors
X_train = matrix.fit_transform(X_train).toarray()
X_test = matrix.transform(X_test).toarray()

# Continue with your processing
np.set_printoptions(threshold=sys.maxsize)


AttributeError: 'list' object has no attribute 'lower'

In [None]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Initialize the CountVectorizer
matrix = CountVectorizer(analyzer='word', max_features=1000, ngram_range=(1, 3))

# Transform X_train and X_test from text to feature vectors
X_train = matrix.fit_transform(X_train).toarray()  # Fit and transform on the training data
X_test = matrix.transform(X_test).toarray()        # Transform the test data

# Ensure y_train and y_test are numpy arrays with the correct shape
y_train = np.array(y_train)
y_test = np.array(y_test)

# Print shapes to debug
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Check if X_train is 2D and y_train is 1D
if len(X_train.shape) != 2 or len(y_train.shape) != 1:
    print("Reshaping needed.")
    # Example: reshape if y_train is a 2D array with a single column
    if len(y_train.shape) == 2 and y_train.shape[1] == 1:
        y_train = y_train.ravel()  # Convert to 1D array

# Train the model
classifier = GaussianNB()
classifier.fit(X_train, y_train)


In [35]:
# Apply the pipeline to the new data
processed_validation = loaded_model.transform(cleaned_validation)

# Optionally, you can select specific columns for evaluation
selected_data = processed_validation.select("id", "Text", "prediction", "Label2")

# Show the processed data
selected_data.show()


NameError: name 'loaded_model' is not defined

In [37]:
# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="Label2", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(processed_validation)
print("Accuracy:", accuracy)

# --- Create a confusion matrix

# Convert Spark DataFrame to Pandas DataFrame
prediction_and_label_pd = processed_validation.select("prediction", "Label2").toPandas()

# Extract predicted labels and true labels
predicted_labels = prediction_and_label_pd["prediction"].tolist()
true_labels = prediction_and_label_pd["Label2"].tolist()

# Create confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
# Plot confusion matrix using Seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=True, yticklabels=True)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()

NameError: name 'processed_validation' is not defined