In [1]:
import numpy as np # linear algebra
import pandas as pd
import re
import tensorflow as tf
import string
import matplotlib.pyplot as plt

In [3]:
train_data = pd.read_csv('train.csv', usecols=['id','text','target'])
test_data = pd.read_csv('test.csv', usecols=['id','text'])

In [4]:
train_data.shape

(7613, 3)

In [5]:
train_data = train_data.drop_duplicates(subset=['text', 'target'], keep='first')
train_data.shape

(7521, 3)

In [6]:
print(train_data.target.value_counts())

0    4315
1    3206
Name: target, dtype: int64


In [7]:
train_data.head(5)

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In this competition, we’re challenged to build a machine learning model that predicts which Tweets are about real disasters and which one’s aren’t. You’ll have access to a dataset of 10,000 tweets that were hand classified

## text preprocessing

Preprocessing cleaning the text data 

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

In [9]:
raw_txt = train_data.text.values

In [42]:
## clean function
pstem = PorterStemmer()

def remove_urls (vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    return(vTEXT)

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def preprocess(tweet):
    tweet = deEmojify(tweet)
    tweet = tweet.lower()
    tweet = tweet.rstrip()
    tweet = remove_urls(tweet)
    tweet = re.sub(r'&amp;', 'and', tweet)
    tweet = re.sub(r'(@.*?)[\s]', ' ', tweet)
    tweet = "".join([char for char in tweet if char not in string.punctuation])
    tweet = re.sub('[0-9]', '', tweet)
    tweet = " ".join([word for word in tweet.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    tweet = tweet.strip()
    tokens = word_tokenize(tweet)
    tokens=[pstem.stem(word) for word in tokens]
    tweet = ' '.join(tokens)
    
    return tweet
    
    

In [11]:
count = 0
numbers = []
for tweet in raw_txt:
    if any(char.isdigit() for char in tweet):
        count += 1
    numbers.append(any(char.isdigit() for char in tweet))
        
idx = []
for i, bol in enumerate(numbers):
    if bol == True:
        idx.append(i)

In [12]:
raw_txt[1612]

"Greece's tax revenues collapse as debt crisis continues\nhttp://t.co/uxp6PoqjLb"

In [13]:
clean_txt = []
for tweet in raw_txt:
    tweet = preprocess(tweet)
    clean_txt.append(tweet)
    

In [14]:
clean_txt = np.array(clean_txt)

count = 0
numbers = []
for tweet in clean_txt:
    if any(char.isdigit() for char in tweet):
        count += 1
    numbers.append(any(char.isdigit() for char in tweet))
        
idx = []
for i, bol in enumerate(numbers):
    if bol == True:
        idx.append(i)
        
count

0

### model using sklearn / Logistic Regression

employ various models to clean text

In [29]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix as cm

In [21]:
y = train_data.target
X_train, X_val, y_train, y_val = train_test_split(clean_txt, y, test_size=0.2, random_state=2021)

### TF-IDF + Logistic regression

In [22]:
# Calculate TF-IDF
tf_idf = TfidfVectorizer(ngram_range=(1, 3), binary=True, smooth_idf=False)
X_train_tfidf = tf_idf.fit_transform(X_train)
X_val_tfidf = tf_idf.transform(X_val)

X_train_tfidf.shape

(6016, 81657)

In [24]:
clf = LogisticRegression(C=1)
clf.fit(X_train_tfidf, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
Y_test_pred = clf.predict(X_val_tfidf)
accuracy_score(y_val, Y_test_pred)

0.8033222591362126

In [32]:
cm(y_val, Y_test_pred, labels=[0, 1])

array([[775,  71],
       [225, 434]])

In [46]:
test_txt = []
for tweet in test_data.text.values:
    tweet = preprocess(tweet)
    test_txt.append(tweet)

X_test_tfidf = tf_idf.transform(test_txt)
pred = clf.predict(X_test_tfidf)

In [47]:
submision1 = pd.DataFrame()
submision1['id'] = test_data.id.values
submision1['target'] = pred

In [50]:
submision1.to_csv("submission.csv",index=False, header=True)

### Logistic regression with tensorflow

In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM,GRU, Dropout, Activation, Input, Flatten, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras import layers
from keras import optimizers

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [39]:
print(tf.__version__)

2.4.0


### Using Bert

In [115]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


No GPU available, using the CPU instead.
