## NLP Social Media Data Example

### Imports

In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Sample social media data 
data = pd.read_csv('social_media_data.csv')
data.head()

Unnamed: 0,text
0,I just tried the new restaurant in town and it...
1,Disappointed with the service at the hotel. Th...
2,Loving the weather today! Perfect for outdoor ...
3,The customer support team was incredibly helpf...
4,Got stuck in traffic for hours. Not a great st...


### Preprocessing
The function `preprocess_text` takes a text input, tokenizes it into words, converts them to lowercase, removes stopwords and non-alphanumeric tokens, and then lemmatizes each token joining the lemmatized tokens back into a string. It then creates a new column 'clean_text' in the Dataframe containing the preprocessed text.

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return ' '.join(lemmatized_tokens)

data['clean_text'] = data['text'].apply(preprocess_text)

### Calculating Sentiment

In [4]:
sid = SentimentIntensityAnalyzer()

data['sentiment_scores'] = data['clean_text'].apply(lambda x: sid.polarity_scores(x))

data['compound_sentiment'] = data['sentiment_scores'].apply(lambda x: x['compound'])

data['sentiment'] = data['compound_sentiment'].apply(lambda x: 'positive' if x >= 0 else 'negative')

In [5]:
data.head()

Unnamed: 0,text,clean_text,sentiment_scores,compound_sentiment,sentiment
0,I just tried the new restaurant in town and it...,tried new restaurant town fantastic,"{'neg': 0.0, 'neu': 0.526, 'pos': 0.474, 'comp...",0.5574,positive
1,Disappointed with the service at the hotel. Th...,disappointed service hotel need improve,"{'neg': 0.344, 'neu': 0.333, 'pos': 0.322, 'co...",-0.0516,negative
2,Loving the weather today! Perfect for outdoor ...,loving weather today perfect outdoor activity,"{'neg': 0.0, 'neu': 0.345, 'pos': 0.655, 'comp...",0.8225,positive
3,The customer support team was incredibly helpf...,customer support team incredibly helpful great...,"{'neg': 0.0, 'neu': 0.282, 'pos': 0.718, 'comp...",0.8799,positive
4,Got stuck in traffic for hours. Not a great st...,got stuck traffic hour great start day,"{'neg': 0.18, 'neu': 0.45, 'pos': 0.369, 'comp...",0.4767,positive


### Feature extraction
We create the X using a TF-IDF vectorizer transforming the newly created 'clean_text' column into numerical feature vectors. We also create the y using the 'sentiment' column.

In [6]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(data['clean_text'])
y = data['sentiment']

### Split data into training and testing sets
We split the data into 80% training and 20% testing.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Train a classifier
Train a classifier using a Support Vector Machine (SVM) classifier.

In [8]:
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

### Predictions

In [9]:
y_pred = classifier.predict(X_test)

### Evaluation

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         1
    positive       0.75      1.00      0.86         3

    accuracy                           0.75         4
   macro avg       0.38      0.50      0.43         4
weighted avg       0.56      0.75      0.64         4

