In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **loading data**

In [None]:
# Load the tweet dataset from the Kaggle input directory into a DataFrame.
train=pd.read_csv('/kaggle/input/sentimental-analysis-for-tweets/sentiment_tweets3.csv')
# Print (rows, columns) as a quick sanity check of the load.
print(f'Train data shape: {train.shape}')

# Last expression of the cell: shows the first rows.
train.head()

# **EDA**

In [None]:
# Count rows that are exact duplicates of an earlier row.
train.duplicated().sum()

In [None]:
# Missing values check: number of null entries per column.
print(f'Missing values in train data:\n{train.isnull().sum()}')


**No duplicated rows and no missing values!**

In [None]:
# Column names, dtypes and non-null counts.
train.info()

In [None]:
# Number of tweets per label value (class balance).
train['label (depression result)'].value_counts()

In [None]:
# Same class counts, drawn as a bar chart.
train['label (depression result)'].value_counts().plot(kind='bar')

In [None]:
#plot
import matplotlib.pyplot as plt
# NLP
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
import re

# Warning
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Word clouds for depressive vs. non-depressive tweets, drawn side by side.
# A dedicated name keeps WordCloud's stop-word set from shadowing the
# `nltk.corpus.stopwords` module imported in the previous cell.
wordcloud_stopwords = set(STOPWORDS)

# Removing 'user' word as it does not hold any importance in our context
wordcloud_stopwords.add('user')

# label value -> panel title, in left-to-right plotting order
wordcloud_titles = {1: 'depression_tweets', 0: 'not_depression_tweets'}

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, (label, title) in zip(axes, wordcloud_titles.items()):
    # Join the tweets themselves; Series.to_string() would also put the index
    # labels and alignment padding into the text fed to WordCloud.
    tweets_text = train.loc[train['label (depression result)'] == label,
                            'message to examine'].str.cat(sep=' ')
    cloud = WordCloud(width=800, height=800,
                      background_color='white', stopwords=wordcloud_stopwords,
                      min_font_size=10).generate(tweets_text)
    ax.imshow(cloud)
    ax.axis("off")
    ax.set_title(title, fontdict={'fontsize': 20})

fig.tight_layout()
plt.show()

# **Data Preprocessing**

In [None]:
# Start the cleaned-text column as a lower-cased copy of the raw tweet; every
# later cleaning cell rewrites this column.
train['message_cleaned']=train['message to examine'].str.lower()
train.head()

**Stop-word removal**

In [None]:
# Fetch NLTK's stop-word corpus so stopwords.words() below can read it.
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
# English stop words as a set; remove_stopwords tests membership against it.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` lower-cased with every word found in `stop_words` removed.

    The text is split on whitespace and the surviving words are joined back
    with single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return " ".join(kept)

# Drop stop words from every tweet in the cleaned column, then preview it.
train['message_cleaned'] = train['message_cleaned'].apply(remove_stopwords)
train.head()

In [None]:
#Cleaning URLs
def clean_url(data):
    """Replace URL-like tokens in `data` with a space and pad leftover slashes.

    A token is treated as a URL when it looks like ``name.tld`` with an
    optional ``http://``, ``https://`` or ``ftp://`` scheme, an optional
    ``www.`` prefix and an optional path/query suffix.

    The scheme is matched as ``(https?|ftp)://``. The previous alternation
    (``https:`` followed by an optional ``://``) could never consume
    ``https://``, so the scheme was left in the text and then split into
    ``https: /  / `` by the slash padding below.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        Text with each URL-like token replaced by one space and every
        remaining ``/`` surrounded by spaces.
    """
    url_pattern = (
        r"(?:(?:https?|ftp):\/\/)?(?:www\.)?"
        r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b"
        r"[-a-zA-Z0-9()@:%_\+.~#?&\/=]*"
    )
    data = re.sub(url_pattern, ' ', data)
    # Any slash that was not part of a URL becomes its own token.
    return re.sub(r'/', ' / ', data)
# Strip URLs from every tweet in the cleaned column, then preview it.
train['message_cleaned'] = train['message_cleaned'].apply(clean_url)
train.head()

In [None]:
#Cleaning Punctuations
import string
def clean_punctuations(text):
    """Return `text` with every character listed in `string.punctuation` removed."""
    return "".join(ch for ch in text if ch not in string.punctuation)
# Remove punctuation from every tweet in the cleaned column, then preview it.
# NOTE(review): this deletes '#', '@', '<', ':', ';', '=', '(', ')', '/' and
# '.', yet the hashtag, username, emoji and image-link cells further down
# search for exactly those characters. Consider running them before this step.
train['message_cleaned'] = train['message_cleaned'].apply(clean_punctuations)
train.head()


In [None]:
#repeating_char
def clean_repeating_char(text):
    """Shorten every run of three or more identical characters to two."""
    def _keep_two(match):
        # group(1) is the repeated character; emit it twice.
        return match.group(1) * 2

    return re.sub(r"(.)\1\1+", _keep_two, text)
# Collapse over-long character runs in every tweet, then preview the column.
train['message_cleaned'] = train['message_cleaned'].apply(clean_repeating_char)
train.head()

In [None]:
#Cleaning Numbers
def clean_numbers(data):
    """Delete every run of the digits 0-9 from `data`."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', data)
# Remove digits from every tweet in the cleaned column, then preview it.
train['message_cleaned'] = train['message_cleaned'].apply(clean_numbers)
train.head()

In [None]:
#remove_hashtag
def remove_hashtag(data):
    """Replace every hashtag (``#`` followed by word characters) with a space.

    The pattern is a raw string so ``\\w`` is not parsed as a string escape;
    ``\\w`` already covers digits, so the former ``[\\w\\d]`` class is
    simplified without changing what is matched.
    """
    return re.sub(r'#\w+', ' ', data)
# Remove hashtags from every tweet in the cleaned column, then preview it.
# NOTE(review): the punctuation-cleaning cell above already deleted '#', so
# this pattern looks like it can no longer match anything here. Confirm the
# intended order of the cleaning steps.
train['message_cleaned'] = train['message_cleaned'].apply(remove_hashtag)
train.head()

In [None]:
def clean_username(data):
    """Replace every ``@mention`` (``@`` up to the next whitespace) with a space.

    The pattern is a raw string so ``\\S`` is not parsed as a string escape;
    ``\\S`` is equivalent to the former ``[^\\s]``.
    """
    return re.sub(r'@\S+', ' ', data)
# Remove @mentions from every tweet in the cleaned column, then preview it.
# NOTE(review): the punctuation-cleaning cell above already deleted '@', so
# this pattern looks like it can no longer match anything here. Confirm the
# intended order of the cleaning steps.
train['message_cleaned'] = train['message_cleaned'].apply(clean_username)
train.head()


In [None]:
def clean_emoji(data):
    """Replace ASCII emoticons in `data` with placeholder tokens.

    The substitutions run in the listed order: ``<3`` -> ``<heart>``, then
    smile, sad, neutral and laugh faces made of an eye character
    (``8 : = ;``), an optional nose and a mouth.
    """
    substitutions = (
        (r'<3', '<heart>'),
        (r"[8:=;]['`\-]?[)d]+", '<smile>'),
        (r"[8:=;]['`\-]?\(+", '<sad>'),
        (r"[8:=;]['`\-]?[\/|l*]", '<neutral>'),
        (r"[8:=;]['`\-]?p+", '<laugh>'),
    )
    for pattern, token in substitutions:
        data = re.sub(pattern, token, data)
    return data
# Replace emoticons in every tweet in the cleaned column, then preview it.
# NOTE(review): earlier cells already removed punctuation (':', ';', '=', '<')
# and digits ('8', '3'), so none of the emoticon patterns look able to match
# at this point. Confirm the intended order of the cleaning steps.
train['message_cleaned'] = train['message_cleaned'].apply(clean_emoji)
train.head()

In [None]:
def remove_images(tweet):
    """Remove image references from `tweet`.

    ``pic.twitter.com/...`` links are deleted, and file names ending in
    ``.png``, ``.jpg``, ``.gif`` or ``.jpeg`` are replaced with a space.
    Both patterns are raw strings so ``\\w`` and ``\\.`` are not parsed as
    string escapes.
    """
    cleaned_tweet = re.sub(r"pic\.twitter\.com/\S+", '', tweet)
    cleaned_tweet = re.sub(r"\w+\.(?:png|jpg|gif|jpeg)", " ", cleaned_tweet)
    return cleaned_tweet

# Remove image links / file names from every tweet, then preview ten rows.
# NOTE(review): both patterns need a literal '.', which the punctuation-cleaning
# cell above already deleted. Confirm the intended order of the cleaning steps.
train["message_cleaned"] = train["message_cleaned"].apply(remove_images)
train.head(10)

In [None]:
# Preview the frame after all regex-based cleaning steps.
train.head()

In [None]:
# Use the %pip magic explicitly so the package is installed into the running
# kernel's environment instead of relying on IPython automagic.
%pip install language_tool_python

In [None]:
#pip install spellchecker

In [None]:
import language_tool_python
#from spellchecker import SpellChecker
import nltk
from nltk.tokenize import word_tokenize

# Create the LanguageTool checker for US English ('en-US'); it is bound to
# `tool` for use by later cells.
# The SpellChecker alternative is kept commented out.
#spell = SpellChecker()
tool = language_tool_python.LanguageTool('en-US')


In [None]:
# Run LanguageTool's auto-correction on each cleaned tweet. Only one column is
# needed, so apply over that Series instead of a row-wise DataFrame.apply.
train['corrected_tweets'] = train['message_cleaned'].apply(tool.correct)

In [None]:
# Preview ten rows, now including the 'corrected_tweets' column.
train.head(10)

In [None]:
# Re-check null counts per column after the cleaning and correction steps.
train.isnull().sum()

In [None]:
# (rows, columns) of the frame after the added text columns.
train.shape

In [None]:
from nltk.stem import PorterStemmer        
from nltk.tokenize import TweetTokenizer

**Lemmatization**

In [None]:
# Download the Punkt tokenizer data and the WordNet corpus; the WordNet call
# passes '/root/nltk_data/' as the download location.
nltk.download('punkt')
nltk.download('wordnet', '/root/nltk_data/')
# Unpack the WordNet archive next to the zip file.
# NOTE(review): presumably a workaround for the lemmatizer not finding the
# zipped corpus in this environment — confirm it is still needed.
!unzip /root/nltk_data/corpora/wordnet.zip -d /root/nltk_data/corpora/
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [None]:
# One WordNet lemmatizer instance, shared by every call to lemmatizeRows.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()


def lemmatizeRows(text):
    """Lemmatize each whitespace-separated word of `text`; join with spaces."""
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)


# Lemmatize the spell-corrected tweets into a new column.
train['lemmatizedRows'] = train['corrected_tweets'].apply(lemmatizeRows)

train.head()

**stemming**

In [None]:
# One Porter stemmer instance, shared by every call to stemRows.
stemmer = PorterStemmer()


def stemRows(text):
    """Stem each whitespace-separated word of `text`; join with spaces."""
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return " ".join(stems)


# Stem the lemmatized tweets into a new column.
train['stemmedRows'] = train['lemmatizedRows'].apply(stemRows)

train.head()

**TF-IDF Vectorization**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the spell-corrected tweets and encode them as a
# document-term matrix.
# NOTE(review): this reads 'corrected_tweets'; the 'lemmatizedRows' and
# 'stemmedRows' columns built above are not used here. Confirm which column
# is meant to be vectorized.
Vectorizer = TfidfVectorizer()

dataVectorized = Vectorizer.fit_transform(train['corrected_tweets'])

# Cell output: the matrix converted to a dense array.
dataVectorized.toarray()

In [None]:
# (documents, vocabulary size). Read the shape from the matrix itself rather
# than building a full dense copy just to inspect its dimensions.
dataVectorized.shape


**Splitting the data**

In [None]:
from sklearn.model_selection import train_test_split

# Split the text first, then fit TF-IDF on the training tweets only. Fitting
# the vectorizer on every row before splitting lets the held-out tweets shape
# the vocabulary and IDF weights, which leaks test information into training.
labels = train['label (depression result)']

text_train, text_test, y_train, y_test = train_test_split(
    train['corrected_tweets'], labels, test_size=0.2,
    stratify=labels, random_state=0)

# Re-fit the existing vectorizer on the training split and reuse that fitted
# vocabulary to encode the test split.
X_train = Vectorizer.fit_transform(text_train)
X_test = Vectorizer.transform(text_test)

In [None]:
# Shapes of the feature matrices and label vectors for both splits.
X_train.shape , X_test.shape ,y_train.shape ,y_test.shape 

In [None]:
from sklearn.utils import compute_class_weight

# "balanced" class weights as a {label: weight} dict for the estimators below.
# They are computed from the training labels only, so the held-out split does
# not influence the models; the unique labels are computed once and reused.
classLabels = np.unique(y_train)

classWeights = dict(zip(classLabels,
                        compute_class_weight("balanced", classes=classLabels, y=y_train)))

print(classWeights)

**Modelling**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [None]:
# Candidate classifiers. Logistic regression, the decision tree and the random
# forest receive `classWeights` and a fixed random_state.
lr = LogisticRegression(class_weight=classWeights, random_state=0)

# Naive Bayes is created with default arguments (no class weights are passed).
nb = MultinomialNB()

dt = DecisionTreeClassifier(class_weight=classWeights, random_state=0)

rf = RandomForestClassifier(class_weight=classWeights, random_state=0)

# LinearSVC is left disabled.
#svc = LinearSVC(class_weight=classWeights, random_state=0)

In [None]:
# Densify the training matrix once and reuse it for every estimator instead of
# building a separate dense copy for each fit call.
# NOTE(review): these scikit-learn estimators are documented to accept sparse
# input, so the densification could probably be dropped altogether.
X_train_dense = X_train.toarray()

lr.fit(X_train_dense, y_train)

nb.fit(X_train_dense, y_train)

dt.fit(X_train_dense, y_train)

rf.fit(X_train_dense, y_train)

#svc.fit(X_train.toarray(), y_train)

In [None]:
# Accuracy of each fitted model on the training split. The matrix is densified
# once and shared by all four score calls.
X_train_dense = X_train.toarray()

lrScore = lr.score(X_train_dense, y_train)

nbScore = nb.score(X_train_dense, y_train)

dtScore = dt.score(X_train_dense, y_train)

rfScore = rf.score(X_train_dense, y_train)

In [None]:
# Four evenly spaced viridis colours, one per model bar.
colors = plt.cm.viridis(np.linspace(0, 1, 4))

# Display names, in the same order the scores are passed to the bar charts.
models = ["Logistic Regression","Naive Bayes","Decision Trees", "Random Forests"]

In [None]:
# Horizontal bar chart of training accuracy per model (bar length = accuracy).
plt.barh(models, width = [lrScore, nbScore, dtScore, rfScore],
        color = colors)

plt.xlabel("Accuracy")
plt.ylabel("Machine Learning Models")
# Title typo fixed ("Accuaracy" -> "Accuracy").
plt.title("Train Accuracy Comparisons")

print(f'Logistic Regression Train Accuracy score is : {round(lrScore * 100 ,2)}%')

print(f'\nNaive Bayes Train Accuracy score is : {round(nbScore * 100 , 2)}%')

print(f'\nDecision Tree Train Accuracy score is : {round(dtScore * 100 , 2)}%')

print(f'\nRandom Forest Train Accuracy score is : {round(rfScore * 100 , 2)}%')

In [None]:
# Accuracy of each fitted model on the held-out split. These reuse the
# *Score names, replacing the training-split values. The matrix is densified
# once and shared by all four score calls.
X_test_dense = X_test.toarray()

lrScore = lr.score(X_test_dense, y_test)

nbScore = nb.score(X_test_dense, y_test)

dtScore = dt.score(X_test_dense, y_test)

rfScore = rf.score(X_test_dense, y_test)


In [None]:
# Horizontal bar chart of test accuracy per model. In a barh chart the models
# sit on the y-axis and the metric on the x-axis, so the labels are set
# accordingly and a title is added.
plt.barh(models, width = [lrScore, nbScore, dtScore, rfScore],
        color = colors)

plt.xlabel("Accuracy")
plt.ylabel("Machine Learning Models")
plt.title("Test Accuracy Comparisons")


print(f'Logistic Regression Test Accuracy score is : {round(lrScore * 100 ,2)}%')

print(f'\nNaive Bayes Test Accuracy score is : {round(nbScore * 100 , 2)}%')

print(f'\nDecision Tree Test Accuracy score is : {round(dtScore * 100 , 2)}%')

print(f'\nRandom Forest Test Accuracy score is : {round(rfScore * 100 , 2)}%')

**Evaluating Models**

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score

In [None]:
# Hard class predictions of each model on the held-out split. The matrix is
# densified once and shared by all four predict calls.
X_test_dense = X_test.toarray()

lrPreds = lr.predict(X_test_dense)

nbPreds = nb.predict(X_test_dense)

dtPreds = dt.predict(X_test_dense)

rfPreds = rf.predict(X_test_dense)

In [None]:
# ROC AUC on the held-out split. roc_auc_score expects (y_true, y_score): the
# true labels come first, and the score should be a ranking score rather than
# a hard 0/1 prediction. Column 1 of predict_proba is used as the score.
# NOTE(review): this assumes the labels are 0/1, so column 1 is the
# probability of the depressive class. Confirm against `lr.classes_`.
X_test_dense = X_test.toarray()

lrScore = roc_auc_score(y_test, lr.predict_proba(X_test_dense)[:, 1])

nbScore = roc_auc_score(y_test, nb.predict_proba(X_test_dense)[:, 1])

dtScore = roc_auc_score(y_test, dt.predict_proba(X_test_dense)[:, 1])

rfScore = roc_auc_score(y_test, rf.predict_proba(X_test_dense)[:, 1])

In [None]:
# Horizontal bar chart of test ROC AUC per model. In a barh chart the models
# sit on the y-axis and the metric on the x-axis, so the labels are set
# accordingly and a title is added.
plt.barh(models, width = [lrScore, nbScore, dtScore, rfScore],
        color = colors)

plt.xlabel("ROC AUC")
plt.ylabel("Machine Learning Models")
plt.title("Test ROC AUC Comparisons")


print(f'Logistic Regression Test Roc Auc Score is : {round(lrScore * 100 ,2)}%')

print(f'\nNaive Bayes Test Roc Auc Score is : {round(nbScore * 100 , 2)}%')

print(f'\nDecision Tree Test Roc Auc Score is : {round(dtScore * 100 , 2)}%')

print(f'\nRandom Forest Test Roc Auc Score is : {round(rfScore * 100 , 2)}%')

In [None]:
def makeCM_Matrix(cm_matrix, title):
    """Draw a 2x2 confusion matrix as an annotated heatmap and show it.

    Each cell is annotated with its name (True Negative, False Positive,
    False Negative, True Positive in flattened order), its count and its
    share of all samples.

    Parameters
    ----------
    cm_matrix : array with four entries (reshaped to 2x2 for the annotations)
        Confusion matrix to display.
    title : str
        Model name, prefixed to " Confusion Matrix" in the plot title.
    """
    categories = ['Negative','Positive']
    cell_names = ['True Negative','False Positive', 'False Negative','True Positive']

    counts = cm_matrix.flatten()
    shares = counts / np.sum(cm_matrix)

    annotations = []
    for cell_name, count, share in zip(cell_names, counts, shares):
        share_text = '{0:.2%}'.format(share)
        annotations.append(f'{cell_name}\n{count} ({share_text})')
    annotations = np.asarray(annotations).reshape(2,2)

    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(cm_matrix, annot = annotations, fmt = '',
                xticklabels = categories, yticklabels = categories)

    plt.xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
    plt.ylabel("Actual values" , fontdict = {'size':14}, labelpad = 10)
    plt.title (f"{title} Confusion Matrix", fontdict = {'size':18}, pad = 20)
    plt.show()

In [None]:
import seaborn as sns
# Confusion matrix of the logistic-regression predictions on the test split.
makeCM_Matrix(confusion_matrix(y_test, lrPreds), 'Logistic Regression')

In [None]:
# Confusion matrix of the Naive Bayes predictions on the test split.
makeCM_Matrix(confusion_matrix(y_test, nbPreds), 'Naive Bayes')


In [None]:
# Confusion matrix of the decision-tree predictions on the test split.
makeCM_Matrix(confusion_matrix(y_test, dtPreds), 'Decision Tree')


In [None]:
# Confusion matrix of the random-forest predictions on the test split.
makeCM_Matrix(confusion_matrix(y_test, rfPreds), 'Random Forest')


In [None]:
from sklearn.metrics import classification_report

# Store the report under its own name; assigning it to `classification_report`
# would overwrite the imported function with a string.
lrReport = classification_report(y_test, lrPreds, target_names=['No','Yes'])
print('Logistic Regression Classification Report: \n', lrReport)

In [None]:
from sklearn.metrics import classification_report

# Store the report under its own name; assigning it to `classification_report`
# would overwrite the imported function with a string.
nbReport = classification_report(y_test, nbPreds, target_names=['No','Yes'])
print('Naive Bayes Classification Report: \n', nbReport)

In [None]:
from sklearn.metrics import classification_report

# Store the report under its own name; assigning it to `classification_report`
# would overwrite the imported function with a string.
dtReport = classification_report(y_test, dtPreds, target_names=['No','Yes'])
print('Decision Tree Classification Report: \n', dtReport)

In [None]:
from sklearn.metrics import classification_report

# Store the report under its own name; assigning it to `classification_report`
# would overwrite the imported function with a string.
rfReport = classification_report(y_test, rfPreds, target_names=['No','Yes'])
print('Random Forest Classification Report: \n', rfReport)

In [1]:
# Export step: asks nbconvert to write config_template.ipynb out as a script.
# NOTE(review): the captured output below is nbconvert's usage/help text, so
# confirm that config_template.ipynb is the intended notebook name and path.
!jupyter nbconvert --to script config_template.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr