In [19]:
import os
import pickle
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import warnings
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score
from wordcloud import WordCloud

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below --
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Load dataset
# The path is relative to the kernel's working directory. The code below
# reads the 'tweet' (text) and 'label' (target) columns from this frame.
df = pd.read_csv('Twitter_Sentiments.csv')

# Define functions for text preprocessing
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    The removal is a single ``re.sub`` pass using ``pattern`` itself.
    Re-using each matched substring as a new regular expression is avoided
    on purpose, because that approach:

    * leaves residue when one match is a prefix of another
      (``"@ab @abc"`` would end up as ``" c"``),
    * strips a bare ``"@"`` everywhere first, so the handle text after it
      survives, and
    * raises ``re.error`` when a match contains regex metacharacters.

    Args:
        input_txt: The string to clean.
        pattern: A regular expression (string or compiled pattern).

    Returns:
        A new string with all non-overlapping matches deleted.
    """
    return re.sub(pattern, "", input_txt)

def remove_emojis(text):
    """Delete every character that falls in the emoji code-point ranges below.

    The character class is built from the listed ranges and each run of
    matching characters is replaced by the empty string. Note that the last
    range (U+24C2 to U+1F251) is very wide and therefore also removes many
    non-emoji characters that lie inside it, such as CJK ideographs.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all matching characters removed.
    """
    codepoint_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    character_class = "[" + "".join(codepoint_ranges) + "]+"
    matcher = re.compile(character_class, flags=re.UNICODE)
    return matcher.sub(r'', text)

# Apply preprocessing
# The steps run in the same order as preprocess_review() so that training
# text and text seen at prediction time are cleaned the same way.
df['Cleaned_Tweet'] = (
    df['tweet']
    # Drop @handles.
    .apply(lambda tweet: remove_pattern(tweet, r"@[\w]*"))
    # Replace everything except letters and '#' with a space. regex=True is
    # required: since pandas 2.0 the default is a literal (non-regex) match,
    # which would silently leave the text unchanged.
    .str.replace("[^a-zA-Z#]", " ", regex=True)
    .apply(remove_emojis)
    # Discard any remaining non-ASCII characters.
    .apply(lambda tweet: tweet.encode('ascii', 'ignore').decode('ascii'))
    # Keep only words longer than three characters.
    .apply(lambda tweet: " ".join(word for word in tweet.split() if len(word) > 3))
)

# Stemming
# `stemmer` stays at module level because preprocess_review() uses it too.
stemmer = PorterStemmer()
# Stem each word and re-join in a single pass. This avoids the Python-level
# loop with label-based `series[i] = ...` assignment, which only works when
# the index is the default 0..n-1 range.
tokenized_tweet = df['Cleaned_Tweet'].apply(
    lambda tweet: " ".join(stemmer.stem(word) for word in tweet.split())
)
df['Cleaned_Tweet'] = tokenized_tweet

# Split data
# Split the cleaned text BEFORE fitting the vectorizer so the vocabulary and
# document-frequency cut-offs are learned from the training rows only.
# Fitting on the full corpus first would leak information about the test
# rows into the features. random_state and test_size are unchanged.
tweets_train, tweets_test, y_train, y_test = train_test_split(
    df['Cleaned_Tweet'], df['label'], random_state=42, test_size=0.25
)

# Feature extraction
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
x_train = bow_vectorizer.fit_transform(tweets_train)
x_test = bow_vectorizer.transform(tweets_test)

# Full-corpus matrix, kept under its original name for any later code that
# still refers to `bow`. It is not used for training or evaluation.
bow = bow_vectorizer.transform(df['Cleaned_Tweet'])

# Train the model
# Fit the XGBoost classifier on the bag-of-words training matrix.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
xgb_model.fit(x_train, y_train)

# Persist the fitted vectorizer (joblib) and the trained booster (JSON).
joblib.dump(bow_vectorizer, 'bow_vectorizer.joblib')
xgb_model.get_booster().save_model('xgb_model.json')

# Evaluate the model
# Score the held-out split. Both metrics are stored but not displayed here.
pred = xgb_model.predict(x_test)
accuracy = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred)

# (The vectorizer is already saved above with joblib.dump, next to the model.)


# Function to preprocess a new review, e.g. "It's waste of money"
def preprocess_review(review):
    """Clean a single raw review string before it is vectorized.

    Steps, in order: remove @handles, replace every character that is not a
    letter or '#' with a space, strip emoji, drop remaining non-ASCII
    characters, keep only words longer than three characters, and stem each
    surviving word with the module-level ``stemmer``.

    Args:
        review: The raw review text.

    Returns:
        The cleaned, stemmed words joined by single spaces.
    """
    # Raw strings keep the backslash in `\w` from being read as an (invalid)
    # string escape, which newer Python versions warn about.
    review = remove_pattern(review, r"@[\w]*")
    review = re.sub(r"[^a-zA-Z#]", " ", review)
    review = remove_emojis(review)
    review = review.encode('ascii', 'ignore').decode('ascii')
    long_words = [w for w in review.split() if len(w) > 3]
    return " ".join(stemmer.stem(w) for w in long_words)

# Function to predict sentiment of a new review
def predict_sentiment(review):
    """Classify one raw review string as 'positive' or 'negative'.

    The review is cleaned with ``preprocess_review``, turned into a
    bag-of-words row with the fitted ``bow_vectorizer``, and scored by
    ``xgb_model``.

    Args:
        review: The raw review text.

    Returns:
        'positive' when the predicted label is 0, otherwise 'negative'.
        NOTE(review): this assumes label 0 marks the positive class in the
        training data -- confirm against the dataset description.
    """
    processed_review = preprocess_review(review)
    review_bow = bow_vectorizer.transform([processed_review])
    # Exactly one row was passed in, so take that single prediction
    # explicitly instead of relying on the truthiness of an array comparison.
    predicted_label = xgb_model.predict(review_bow)[0]
    return 'positive' if predicted_label == 0 else 'negative'



In [17]:
import pickle
# Pickle the fitted classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

In [18]:
# Reload the pickled classifier. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from the file.
with open('regmodel.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)