In [1]:
import numpy as np
import pandas as pd

# collection of machine learning algorithms
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Common Model Helpers
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn import model_selection
import pylab as pl
from sklearn.metrics import roc_curve
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from scipy.stats import norm

pd.set_option("display.max_rows",10000)  # KISALTMA ENGELLEME
pd.set_option("display.max_columns",10000)  # KISALTMA ENGELLEME

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv') 

In [2]:
submission = pd.read_csv('sample_submission.csv') 

In [7]:
train.info()

train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
train['text'] = train['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
test['text'] = test['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [4]:
train['text'] = train['text'].str.replace('[^\w\s]','')
test['text'] = test['text'].str.replace('[^\w\s]','')
train['text'] = train['text'].str.replace('[\d\_]','')
test['text'] = test['text'].str.replace('[\d\_]','')

In [5]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')

In [6]:
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
test['text'] = test['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [7]:
sil = pd.Series(' '.join(train['text']).split()).value_counts()[-50:]
sil = pd.Series(' '.join(test['text']).split()).value_counts()[-50:]

train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sil))
test['text'] = test['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sil))

In [8]:
from textblob import Word
#nltk.download('wordnet')
train['text'] = train['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) 
test['text'] = test['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) 

In [9]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(train["text"],
                                                                   train["target"], 
                                                                    random_state = 1)

In [51]:
train_x.head()


5242    sb new deepwater horizon oil spill distributio...
4537                rvacchianonydn surprise arent injured
6267    rt tonyhsieh person dance rain likely walk sto...
5486    yet another company trying censor internet red...
985     mattbez oh im bagging body bangin im saying sh...
Name: text, dtype: object

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_word_vectorizer.fit_transform(train_x)

<5709x16566 sparse matrix of type '<class 'numpy.float64'>'
	with 53936 stored elements in Compressed Sparse Row format>

In [12]:
test1 = test["text"]

In [13]:
test1_tf_idf_word = tf_idf_word_vectorizer.transform(test1)

In [99]:
test1_tf_idf_word.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [96]:
test1.head()

Unnamed: 0,text
0,happened terrible car crash
1,heard earthquake different city stay safe ever...
2,forest fire spot pond goose fleeing across str...
3,apocalypse lighting spokane wildfire
4,typhoon soudelor kill china taiwan
5,shakingits earthquake
6,theyd probably still show life arsenal yesterd...
7,hey
8,nice hat
9,fuck


In [14]:
x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

In [15]:
np.shape(x_train_tf_idf_word)

(5709, 16566)

In [None]:
np.shape(x_train_tf_idf_word)

In [71]:
from sklearn.linear_model import LogisticRegression

In [73]:
loj = LogisticRegression()
loj_model = loj.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(loj_model, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Doğruluk Oranı:", accuracy)

Word-Level TF-IDF Doğruluk Oranı: 0.7636649214659686


In [75]:
from sklearn.naive_bayes import MultinomialNB

In [77]:
nb = MultinomialNB()
nb_model = nb.fit(x_train_tf_idf_word,train_y)
accuracy = model_selection.cross_val_score(nb_model, 
                                           x_test_tf_idf_word, 
                                           test_y, 
                                           cv = 10).mean()

print("Word-Level TF-IDF Doğruluk Oranı:", accuracy)

Word-Level TF-IDF Doğruluk Oranı: 0.767834114081014


In [100]:
y_pred = nb_model.predict(test1_tf_idf_word)

In [101]:
print(y_pred)

[1 0 1 ... 1 1 1]


In [104]:
submission= submission[["id"]]

In [110]:
submission.head(10)

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11
5,12
6,21
7,22
8,27
9,29


In [112]:
y_pred1 = pd.DataFrame(data = y_pred, index = range(3263), columns=['target'])

In [114]:
final_submission=pd.concat([submission,y_pred1],axis=1)

In [116]:
final_submission.to_csv('05062020_1.csv',index=False)

In [117]:
final_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0
