In [96]:
#!pip install contractions
#!pip install tensorflow


In [97]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
import string
from tensorflow import keras
import contractions
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [98]:
data = pd.read_csv('tweets.csv')
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [99]:
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    # Handle contractions
    text = contractions.fix(text)
    # Tokenize text
    tokens = text.split()
    # Remove stopwords
    stop_words = ["a", "an", "the", "and", "or", "but", "is", "are", "am", "it", "this", "that", "of", "from", "in", "on"]
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into text
    processed_text = ' '.join(tokens)
    return processed_text

In [100]:
data['tweet'] = data['tweet'].apply(preprocess_text)

In [101]:
# Feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=1500, stop_words='english')
X = tfidf_vectorizer.fit_transform(data['tweet']).toarray()
y = data['label']

In [102]:
import json
# Assuming 'tfidf_vectorizer' is your TfidfVectorizer
vocabulary = tfidf_vectorizer.vocabulary_
# Convert numpy.int64 values to int
vocabulary = {word: int(index) for word, index in vocabulary.items()}

# Save the vocabulary to a JSON file
with open('vocabulary.json', 'w') as f:
    json.dump(vocabulary, f)

In [103]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [104]:
# Model training (Support Vector Machine)
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

In [105]:
# Model evaluation
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.8724747474747475
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1152
           1       0.78      0.74      0.76       432

    accuracy                           0.87      1584
   macro avg       0.84      0.83      0.84      1584
weighted avg       0.87      0.87      0.87      1584



In [106]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [107]:
# Define a neural network architecture
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [108]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [109]:
# Convert the model to TensorFlow Lite format
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

In [110]:
# Save the TensorFlow Lite model to a file
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)

In [111]:
# Load the TensorFlow Lite model
interpreter = tf.lite.Interpreter(model_path="model.tflite")
interpreter.allocate_tensors()
# Get input and output details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
# Print input shape
print("Input shape:", input_details[0]['shape'])
# Print output shape
print("Output shape:", output_details[0]['shape'])

Input shape: [   1 1500]
Output shape: [1 1]


In [112]:
def preprocess_comment(comment):
    comment = re.sub(r'http\S+|@[A-Za-z0-9]+|\d+|[^A-Za-z\s]', '', comment)
    comment = comment.lower()
    comment = contractions.fix(comment)
    tokens = comment.split()
    stop_words = ["a", "an", "the", "and", "or", "but", "is", "are", "am", "it", "this", "that", "of", "from", "in", "on"]
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into text
    processed_comment = ' '.join(tokens)
    return processed_comment

In [113]:
def predict_sentiment(comment):
    preprocessed_comment = preprocess_comment(comment)
    tokenizer = Tokenizer(num_words=1500)
    tokenizer.fit_on_texts([preprocessed_comment])
    one_hot_comment = tokenizer.texts_to_matrix([preprocessed_comment], mode='binary')
    padded_comment = pad_sequences(one_hot_comment, maxlen=1500)
    print("Shape of preprocessed comment:", padded_comment.shape)
    input_data = np.array(padded_comment, dtype=np.float32)
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    output_data = interpreter.get_tensor(output_details[0]['index'])
    predicted_sentiment = int(output_data[0][0] > 0.5)
    return predicted_sentiment

In [114]:
# Example usage
#Return 1 for negative comment and 0 for positive comment
comment = "i dont love this team"
predicted_sentiment = predict_sentiment(comment)
print("Predicted sentiment:", predicted_sentiment)

Shape of preprocessed comment: (1, 1500)
Predicted sentiment: 1
