# **NLP DISASTER TWEET**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
import os
import nltk

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Fetch the NLTK resources this notebook uses at runtime.
# "stopwords" backs the stopwords.words('english') calls in the preprocessing cells.
nltk.download("stopwords")
# NOTE(review): "punkt" is presumably the tokenizer data word_tokenize relies on;
# newer NLTK releases may ask for a differently named resource — confirm.
nltk.download("punkt")

# **LOAD DATA**

In [None]:
# Read the training set (its 'target' column is used as the label later on)
# and the test set whose tweets are scored at the end of the notebook.
# NOTE(review): both paths point at the filesystem root — confirm this matches
# where the CSV files actually live in the runtime environment.
df = pd.read_csv("/train.csv")
test = pd.read_csv("/test.csv")

In [None]:
# Quick structural overview of the training data.

# Number of rows and columns
num_rows, num_cols = df.shape

# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; in the middle of a cell its result is discarded, so print it
# explicitly to actually show the preview.
print(df.head())
# DataFrame.info() writes its summary to stdout itself (it returns None).
df.info()

# Data types of each column
data_types = df.dtypes

# Summary statistics
summary_stats = df.describe()

# Missing values per column
missing_values = df.isnull().sum()

# Unique values in each column
unique_values = df.nunique()

print(num_rows, num_cols)
print('types=', data_types)
print('stats=', summary_stats)
print('missing values=', missing_values)
print('unique_values', unique_values)

# **Data Preprocessing**

In [None]:
# Clean every training tweet: lower-case it, strip URLs and every non-letter
# character, tokenize, drop English stopwords, and stem the remaining tokens.

# Loop-invariant helpers are built once instead of once per tweet.
url_pattern = re.compile(r'http\S+|www\S+')
# Keeps only ASCII letters and whitespace: digits, punctuation and "@" are all
# removed by this pattern, so no separate "@" replacement step is needed.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

cleaned_text_list = []
# fillna/astype guard against missing or non-string entries, which would
# otherwise crash on .lower(); ordinary string tweets are unaffected.
for text in df['text'].fillna('').astype(str):
    # Remove URLs (after lower-casing), then everything that is not a letter
    # or whitespace.
    text = url_pattern.sub('', text.lower())
    text = non_letter_pattern.sub('', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords; the text is already lower-cased, so tokens can be
    # compared against the stopword set directly.
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    cleaned_text_list.append(' '.join(stemmed_tokens))

# Add the cleaned_text_list as a new column in the DataFrame
df['cleaned_text'] = cleaned_text_list

# Features (cleaned text) and labels
x = df['cleaned_text']
y = df['target']
x

# **Feature Extraction**

In [None]:
# Split the cleaned tweets BEFORE vectorizing. Fitting TF-IDF on the full data
# set and splitting afterwards lets vocabulary and IDF statistics from the
# held-out rows leak into the training features, which inflates the
# validation scores. The split parameters (test_size, random_state) are the
# same as before.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42
)

# Learn the vocabulary / IDF weights from the training portion only, then
# project the held-out portion into that same feature space.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Whole data set in the fitted feature space, kept under its original name
# in case other cells reference it.
X = tfidf_vectorizer.transform(df['cleaned_text'])

# **Classification**


In [None]:
# Fit three baseline classifiers on the TF-IDF training matrix.
cl_svc = SVC(kernel='linear')
cl_svc.fit(X_train, y_train)

cl_lr = LogisticRegression()
cl_lr.fit(X_train, y_train)

cl_knn = KNeighborsClassifier(n_neighbors=2)
cl_knn.fit(X_train, y_train)

# Predictions on the held-out split.
cl_svc_y = cl_svc.predict(X_test)
cl_lr_y = cl_lr.predict(X_test)
cl_knn_y = cl_knn.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Accuracy happens to be
# symmetric, but the documented order is used so that swapping in an
# asymmetric metric later does not silently give wrong numbers.
svc_accuracy = accuracy_score(y_test, cl_svc_y)
lr_accuracy = accuracy_score(y_test, cl_lr_y)
knn_accuracy = accuracy_score(y_test, cl_knn_y)

# The report is a multi-line table; putting the label on its own line keeps
# the table's header row aligned with its columns.
print('svm:', classification_report(y_test, cl_svc_y), sep='\n')
print('lr:', classification_report(y_test, cl_lr_y), sep='\n')
print('knn', classification_report(y_test, cl_knn_y), sep='\n')

# Training-set accuracy, kept (not printed) so it can be compared against the
# held-out accuracy above when checking for overfitting.
train_svm_accuracy = cl_svc.score(X_train, y_train)
train_lr_accuracy = cl_lr.score(X_train, y_train)
train_knn_accuracy = cl_knn.score(X_train, y_train)

print("svc_Accuracy:", svc_accuracy)
print("lr_Accuracy:", lr_accuracy)
print("knn_Accuracy:", knn_accuracy)

# **Test Data Preprocessing**


In [None]:
# Clean every test tweet: lower-case it, strip URLs and every non-letter
# character, tokenize, drop English stopwords, and stem the remaining tokens.
# The cleaned text is then vectorized with the already-fitted TF-IDF vectorizer.

# Loop-invariant helpers are built once instead of once per tweet.
url_pattern = re.compile(r'http\S+|www\S+')
# Keeps only ASCII letters and whitespace: digits, punctuation and "@" are all
# removed by this pattern, so no separate "@" replacement step is needed.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

cleaned_test_list = []
# fillna/astype guard against missing or non-string entries, which would
# otherwise crash on .lower(); ordinary string tweets are unaffected.
for text in test['text'].fillna('').astype(str):
    # Remove URLs (after lower-casing), then everything that is not a letter
    # or whitespace.
    text = url_pattern.sub('', text.lower())
    text = non_letter_pattern.sub('', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords; the text is already lower-cased, so tokens can be
    # compared against the stopword set directly.
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    cleaned_test_list.append(' '.join(stemmed_tokens))

# Add the cleaned_test_list as a new column in the DataFrame
test['cleaned_text'] = cleaned_test_list
# transform (not fit_transform): reuse the vocabulary learned earlier so the
# test matrix has the same columns the classifiers were trained on.
test_x = tfidf_vectorizer.transform(test['cleaned_text'])

# **Predicting**

In [None]:
# Score the vectorized test tweets with each fitted classifier, in the order
# SVM, logistic regression, k-nearest neighbours.
test_pred_svc, test_pred_lr, test_pred_knn = (
    model.predict(test_x) for model in (cl_svc, cl_lr, cl_knn)
)

# **Result**

In [None]:
# Start from the sample submission file, set its 'target' column to the SVM
# predictions, and write the result out without the index column.
# NOTE(review): this relies on the sample file's rows being in the same order
# as the rows of the test set — confirm.
submissions = pd.read_csv(r"/sample_submission.csv").assign(target=test_pred_svc)
submissions.to_csv("submissions.csv", index=False)