## Importing Cleaned Data

In [1]:
import pandas as pd

df = pd.read_csv('20221202 Processed Data.csv')
df.tail()

Unnamed: 0,id,keyword,location,text,target
11365,11365,wrecked,Blue State in a red sea,Media should have warned us well in advance. T...,0
11366,11366,wrecked,arohaonces,i feel directly attacked 💀 i consider moonbin ...,0
11367,11367,wrecked,🇵🇭,i feel directly attacked 💀 i consider moonbin ...,0
11368,11368,wrecked,auroraborealis,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0
11369,11369,wrecked,,Jake Corway wrecked while running 14th at IRP.,1


## Apply Preprocessing

In [2]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

stop_words=set(stopwords.words("english"))
porter = PorterStemmer()

def preprocess(texts):
    for n in range(0, len(texts)):
        text = texts[n]
        text = word_tokenize(text)
        text = [word.lower() for word in text]
        text = [word for word in text if word not in stop_words]
        text = [word for word in text if word.isalpha()]
        text = [porter.stem(word) for word in text]
        text = " ".join(text)
        texts[n] = text
    return texts

In [4]:
df['Text_new'] = preprocess(list(df['text']))

## Count Vectorizer

In [5]:
df['text']

0        Communal violence in Bhainsa, Telangana. "Ston...
1        Telangana: Section 144 has been imposed in Bha...
2        Arsonist sets cars ablaze at dealership https:...
3        Arsonist sets cars ablaze at dealership https:...
4        "Lord Jesus, your love brings freedom and pard...
                               ...                        
11365    Media should have warned us well in advance. T...
11366    i feel directly attacked 💀 i consider moonbin ...
11367    i feel directly attacked 💀 i consider moonbin ...
11368    ok who remember "outcast" nd the "dora" au?? T...
11369       Jake Corway wrecked while running 14th at IRP.
Name: text, Length: 11370, dtype: object

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(list(df['text']))

CountVectorizer()

In [7]:
vect.get_feature_names_out()

array(['00', '000', '00009', ..., '𝗠𝗔𝗬', '𝗣𝗲𝘁𝗿𝗼𝘁𝗲𝗾', '𝗳𝗼𝗿'], dtype=object)

In [8]:
simple_train_dtm = vect.transform(df['text'])
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,00,000,00009,000ft,000kg,007,0089,00am,00pm,00u3qm1ucs,...,𝐲𝐨𝐮,𝒋𝒊𝒍𝒍,𝒗𝒂𝒍𝒆𝒏𝒕𝒊𝒏𝒆,𝗖𝗢𝗥𝗧,𝗘𝗻𝗲𝗿𝗴𝘆,𝗚𝗶𝘃𝗲𝗮𝘄𝗮𝘆,𝗜𝗻𝗰,𝗠𝗔𝗬,𝗣𝗲𝘁𝗿𝗼𝘁𝗲𝗾,𝗳𝗼𝗿
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Split columns to independent and dependent variables
### independent variables is turned into tf-idf 

In [10]:
# MODEL
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection._univariate_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
Y = df['target']
# END MODEL

In [11]:
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Split data into test and train

In [12]:
# SPLIT INTO TEST AND TRAIN
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y, 
                                                    test_size=0.3, 
                                                    random_state=42)
# END SPLIT

## Applying models and predict

In [16]:
# FIT AND EVALUATE MODEL
from sklearn.neighbors._classification import KNeighborsClassifier
from sklearn.svm._classes import LinearSVC

ch2 = SelectKBest(chi2, k=1000)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

clf = MultinomialNB()
# clf = LinearSVC()
# clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("accuracy: %0.3f" % metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, target_names=['Disaster', 'No Disaster']))
# END EVALUATE

accuracy: 0.848
[[2805    3]
 [ 516   87]]
              precision    recall  f1-score   support

    Disaster       0.84      1.00      0.92      2808
 No Disaster       0.97      0.14      0.25       603

    accuracy                           0.85      3411
   macro avg       0.91      0.57      0.58      3411
weighted avg       0.87      0.85      0.80      3411



In [18]:
# FIT AND EVALUATE MODEL
from sklearn.neighbors._classification import KNeighborsClassifier
from sklearn.svm._classes import LinearSVC

ch2 = SelectKBest(chi2, k=1000)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

# clf = MultinomialNB()
clf = LinearSVC()
# clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("accuracy: %0.3f" % metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, target_names=['Disaster', 'No Disaster']))
# END EVALUATE

accuracy: 0.889
[[2748   60]
 [ 320  283]]
              precision    recall  f1-score   support

    Disaster       0.90      0.98      0.94      2808
 No Disaster       0.83      0.47      0.60       603

    accuracy                           0.89      3411
   macro avg       0.86      0.72      0.77      3411
weighted avg       0.88      0.89      0.88      3411



In [19]:
# FIT AND EVALUATE MODEL
from sklearn.neighbors._classification import KNeighborsClassifier
from sklearn.svm._classes import LinearSVC

ch2 = SelectKBest(chi2, k=1000)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

# clf = MultinomialNB()
# clf = LinearSVC()
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("accuracy: %0.3f" % metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, target_names=['Disaster', 'No Disaster']))
# END EVALUATE

accuracy: 0.850
[[2790   18]
 [ 494  109]]
              precision    recall  f1-score   support

    Disaster       0.85      0.99      0.92      2808
 No Disaster       0.86      0.18      0.30       603

    accuracy                           0.85      3411
   macro avg       0.85      0.59      0.61      3411
weighted avg       0.85      0.85      0.81      3411

