<a href="https://colab.research.google.com/github/kritikaparmar-programmer/ML_Projects/blob/main/Reddit_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Data visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical libraries
from sklearn.feature_selection import chi2

# NLP
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer

# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Performance Evaluation and Support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

  import pandas.util.testing as tm


In [None]:
# Load the scraped Reddit posts from the Colab sample_data directory
data = pd.read_csv(filepath_or_buffer='/content/sample_data/data_reddit_india.csv')

In [None]:
# Data shuffling
# Drop the index column left over from the CSV export, then shuffle the rows.
#   * errors='ignore' keeps the cell re-runnable once the column is already gone
#     (an in-place drop raises KeyError on the second run).
#   * random_state pins the row order so every downstream result is reproducible.
#   * reset_index(drop=True) gives the shuffled frame a clean 0..n-1 index.
# Building a new frame also avoids pushing the rows through `.values`, which
# turns a mixed-dtype frame into a single object array.
data = (
    data.drop(columns=['Unnamed: 0'], errors='ignore')
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
)
data.head()

Unnamed: 0,Title,Score,ID,URL,num_comments,created_on,Body,Original,Flair,Comments
0,I finally convinced him. That awesome moment w...,1892,d0em6s,https://www.reddit.com/r/india/comments/d0em6s...,184,1567793000.0,A little long story. I am going through a lot...,False,[R]eddiquette,Larke ko ias officer banao....kudos to you op.
1,Mumbai's high Covid count due to aggressive te...,258,g5wv53,https://theprint.in/theprint-otc/mumbais-high-...,37,1587570000.0,,False,Policy/Economy,> high Covid count due to aggressive testing\n...
2,"@KeralaTourism: Tender chunks of beef, slow-ro...",385,ep32la,https://twitter.com/KeralaTourism/status/12174...,63,1579128000.0,,False,Food,[removed]
3,Music from the string quartet that raised the ...,17,g1l3p3,https://www-thehindu-com.cdn.ampproject.org/v/...,0,1586954000.0,,False,Sports,
4,"I am Ashish K. Mishra, I write stories. AMA",76,76xx01,https://www.reddit.com/r/india/comments/76xx01...,212,1508269000.0,I am the Managing Editor of The Ken. We write ...,False,AMA,Would you rather reverse one decision you make...


In [None]:
# Display each column's dtype and non-null count (shows which columns have missing values)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1650 entries, 0 to 1649
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         1650 non-null   object 
 1   Score         1650 non-null   int64  
 2   ID            1650 non-null   object 
 3   URL           1650 non-null   object 
 4   num_comments  1650 non-null   int64  
 5   created_on    1650 non-null   float64
 6   Body          635 non-null    object 
 7   Original      1650 non-null   bool   
 8   Flair         1650 non-null   object 
 9   Comments      1557 non-null   object 
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 117.8+ KB


In [None]:
# How many distinct flairs are there, and what are they?
flair_values = data['Flair'].unique()
print(len(flair_values))
flair_values

11


array(['[R]eddiquette', 'Policy/Economy', 'Food', 'Sports', 'AMA',
       'Business/Finance', 'Photography', 'Science/Technology',
       'AskIndia', 'Non-Political', 'Politics'], dtype=object)

In [None]:
# Columns kept for modelling; every other column is discarded from here on
features = ['Flair', 'URL', 'Title', 'Comments', 'Body']
data = data.loc[:, features]
data.head()

Unnamed: 0,Flair,URL,Title,Comments,Body
0,[R]eddiquette,https://www.reddit.com/r/india/comments/d0em6s...,I finally convinced him. That awesome moment w...,Larke ko ias officer banao....kudos to you op.,A little long story. I am going through a lot...
1,Policy/Economy,https://theprint.in/theprint-otc/mumbais-high-...,Mumbai's high Covid count due to aggressive te...,> high Covid count due to aggressive testing\n...,
2,Food,https://twitter.com/KeralaTourism/status/12174...,"@KeralaTourism: Tender chunks of beef, slow-ro...",[removed],
3,Sports,https://www-thehindu-com.cdn.ampproject.org/v/...,Music from the string quartet that raised the ...,,
4,AMA,https://www.reddit.com/r/india/comments/76xx01...,"I am Ashish K. Mishra, I write stories. AMA",Would you rather reverse one decision you make...,I am the Managing Editor of The Ken. We write ...


In [None]:
# Assign an individual integer id to each flair
flair_codes, _ = data['Flair'].factorize()
data['id'] = flair_codes

# One row per (Flair, id) pair, ordered by id
flair_category = (
    data[['Flair', 'id']]
    .drop_duplicates()
    .sort_values('id')
)
flair_category

Unnamed: 0,Flair,id
0,[R]eddiquette,0
1,Policy/Economy,1
2,Food,2
3,Sports,3
4,AMA,4
6,Business/Finance,5
7,Photography,6
12,Science/Technology,7
20,AskIndia,8
23,Non-Political,9


In [None]:
# Flair name -> integer id lookup
category_labels = {flair: flair_id for flair, flair_id in flair_category.values}
print(category_labels)

print("======="*15) # Visual separator between the two dictionaries

# Inverse lookup (integer id -> flair name) to turn predicted labels back into categories
category_reverse = {flair_id: flair for flair, flair_id in category_labels.items()}
print(category_reverse)

{'[R]eddiquette': 0, 'Policy/Economy': 1, 'Food': 2, 'Sports': 3, 'AMA': 4, 'Business/Finance': 5, 'Photography': 6, 'Science/Technology': 7, 'AskIndia': 8, 'Non-Political': 9, 'Politics': 10}
{0: '[R]eddiquette', 1: 'Policy/Economy', 2: 'Food', 3: 'Sports', 4: 'AMA', 5: 'Business/Finance', 6: 'Photography', 7: 'Science/Technology', 8: 'AskIndia', 9: 'Non-Political', 10: 'Politics'}


In [None]:
# Build one text field per post: the title, followed by the body and then the
# comments whenever they are present. Missing values are skipped, so no "nan"
# text ends up in the combined string.
#
# The column is assembled on a standalone Series using boolean masks and is
# assigned to the frame in a single step. This avoids chained assignment
# (`data['Combine'][i] = ...`), which raises SettingWithCopyWarning and does not
# modify the frame at all when pandas copy-on-write is enabled.
combined = data['Title'].copy()
for column in ['Body', 'Comments']:
    present = data[column].notna()
    combined.loc[present] = (
        combined.loc[present] + ' ' + data.loc[present, column].astype(str)
    )
data['Combine'] = combined

data.head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Flair,URL,Title,Comments,Body,id,Combine
0,[R]eddiquette,https://www.reddit.com/r/india/comments/d0em6s...,I finally convinced him. That awesome moment w...,Larke ko ias officer banao....kudos to you op.,A little long story. I am going through a lot...,0,I finally convinced him. That awesome moment w...
1,Policy/Economy,https://theprint.in/theprint-otc/mumbais-high-...,Mumbai's high Covid count due to aggressive te...,> high Covid count due to aggressive testing\n...,,1,Mumbai's high Covid count due to aggressive te...
2,Food,https://twitter.com/KeralaTourism/status/12174...,"@KeralaTourism: Tender chunks of beef, slow-ro...",[removed],,2,"@KeralaTourism: Tender chunks of beef, slow-ro..."
3,Sports,https://www-thehindu-com.cdn.ampproject.org/v/...,Music from the string quartet that raised the ...,,,3,Music from the string quartet that raised the ...
4,AMA,https://www.reddit.com/r/india/comments/76xx01...,"I am Ashish K. Mishra, I write stories. AMA",Would you rather reverse one decision you make...,I am the Managing Editor of The Ken. We write ...,4,"I am Ashish K. Mishra, I write stories. AMA I ..."
5,Sports,https://www.reddit.com/r/india/comments/f5pxap...,Need help related to my research project on Sp...,Filled it,"Hey guys,\n\nI am working on a research projec...",3,Need help related to my research project on Sp...
6,Business/Finance,https://www.reddit.com/r/india/comments/g0zm53...,The Current scenario for IT professionals (Cov...,"I work for an OTT client, so their business is...","Hey guys, i wanted to know what's the scenario...",5,The Current scenario for IT professionals (Cov...
7,Photography,https://i.redd.it/zrdg4z3wbjr31.jpg,Kolkata at night during Durga Pujo,Fuck that pollution looks like hell.,,6,Kolkata at night during Durga Pujo Fuck that p...
8,Photography,https://i.redd.it/rphk0n2proa41.jpg,"Everybody's posting pictures of my hometown, s...",And here i am waiting for vikendi update,,6,"Everybody's posting pictures of my hometown, s..."
9,AMA,https://www.reddit.com/r/india/comments/35eabh...,Hello Reddit! Kaneez Surka here. Ask Me Anythi...,"Unlike Aditi Mittal, can we find hot pics of y...",I am a comedian and improviser. Currently part...,4,Hello Reddit! Kaneez Surka here. Ask Me Anythi...


In [None]:
# Fetch NLTK's stopword corpus into the local nltk_data directory
import nltk
nltk.download('stopwords')

# Collect all the English stopwords (a plain list of lowercase words) and display them
STOPWORDS = nltk.corpus.stopwords.words('english')
print(STOPWORDS)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'ea

In [None]:
# Separator-like characters that are turned into a single space
REPLACE_SPACES = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase letter, space, '#', '+' or '_' is deleted
BAD_SYMBOLS = re.compile(r'[^0-9a-z #+_]')

def clean_text(text, stopwords=None):
    '''
        Normalise a piece of text for bag-of-words vectorisation.

        text: a string
        stopwords: optional iterable of words to drop; when omitted the
                   module-level STOPWORDS list is used
        return: modified initial string (lowercased, separator characters
                turned into spaces, remaining symbols removed, stopwords
                dropped, whitespace collapsed to single spaces)
    '''
    if stopwords is None:
        stopwords = STOPWORDS
    # A set gives O(1) membership tests instead of scanning a list for every word
    stopwords = frozenset(stopwords)

    text = text.lower() # lowercase text
    text = REPLACE_SPACES.sub(' ', text) # separators become spaces so words stay apart
    text = BAD_SYMBOLS.sub('', text) # drop every remaining non-alphanumeric symbol
    # Letters are deliberately left untouched: deleting every 'x' from the text
    # would corrupt ordinary words such as "tax" or "next".

    # split()/join also collapses runs of whitespace left by the substitutions
    return ' '.join(word for word in text.split() if word not in stopwords)

# Normalise every combined post with clean_text
data['Combine'] = data['Combine'].apply(clean_text)
# Remove runs of digits. regex=True must be explicit: since pandas 2.0 the
# default is a literal match, which would look for the text "\d+" and leave
# every digit in place. The raw string avoids the invalid "\d" escape warning.
data['Combine'] = data['Combine'].str.replace(r'\d+', '', regex=True)
data['Combine'].head(10)

0    finally convinced awesome moment understand es...
1    mumbais high covid count due aggressive testin...
2    keralatourism tender chunks beef slowroasted a...
3     music string quartet raised level indian cricket
4    ashish k mishra write stories ama managing edi...
5    need help related research project sports cons...
6    current scenario professionals covid hey guys ...
7    kolkata night durga pujo fuck pollution looks ...
8    everybodys posting pictures hometown figured i...
9    hello reddit kaneez surka ask anything comedia...
Name: Combine, dtype: object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# TF-IDF vectorizer over unigrams and bigrams; terms appearing in fewer than
# 5 documents are ignored
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
    encoding='latin-1',
)

# Fit the vectorizer on the combined text of every post and densify the result
feat_sparse = tfidf.fit_transform(data['Combine'])
feat = feat_sparse.toarray()

# Series containing the integer flair id of every post
labels = data['id']
print(feat.shape)

(1650, 3299)


In [None]:
# Hold out 20% of the posts for evaluation; the fixed seed makes the split
# repeatable for a given row order
split = train_test_split(data['Combine'], data['Flair'],
                         random_state=42,
                         test_size=0.2)
X_train, X_test, y_train, y_test = split

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1320,) (1320,) (330,) (330,)


In [None]:
# Bag-of-words counts; the vocabulary is learned from the training split only
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(raw_documents=X_train)

# Re-weight the raw counts with TF-IDF, again fitted on the training split only
tfidf_trans = TfidfTransformer()
X_train_tfidf = tfidf_trans.fit_transform(X=X_train_counts)

In [None]:
# Apply the already-fitted vectorizer and TF-IDF transformer to the test split
# (transform only, nothing is re-fitted on test data)
X_test_counts = count_vec.transform(X_test)
X_test_tfidf = tfidf_trans.transform(X_test_counts)

In [None]:
# Multinomial Naive Bayes trained on the TF-IDF features of the training split
model = MultinomialNB().fit(X_train_tfidf, y_train)
# Predicted flair for every held-out post; X_test_tfidf must come from the
# vectorizer and transformer that were fitted on the training split
model.predict(X_test_tfidf)

array(['AMA', 'Policy/Economy', 'Policy/Economy', '[R]eddiquette',
       'Policy/Economy', 'Business/Finance', 'AMA', 'AMA', 'AskIndia',
       'Policy/Economy', 'AskIndia', 'Policy/Economy', 'Food',
       'Photography', 'AskIndia', 'AMA', 'Policy/Economy', 'Photography',
       'AskIndia', 'Sports', 'AskIndia', 'Food', 'AMA', 'AskIndia', 'AMA',
       'Policy/Economy', 'AskIndia', 'AskIndia', 'Sports', 'AMA',
       'AskIndia', 'Policy/Economy', 'AskIndia', 'AskIndia', 'AMA',
       'Food', 'Photography', 'Business/Finance', 'Politics', 'Politics',
       'Politics', 'AMA', 'AMA', 'AskIndia', 'AskIndia', 'Photography',
       'AMA', 'Sports', 'Sports', 'Politics', 'AMA', 'Photography',
       'Sports', 'AskIndia', 'AskIndia', 'Politics', 'Food',
       'Business/Finance', 'Sports', 'AMA', 'AMA', 'AMA', 'Sports',
       'Sports', 'AskIndia', 'AMA', 'AskIndia', 'Non-Political',
       'Business/Finance', 'AskIndia', 'AskIndia', 'AskIndia',
       'Business/Finance', 'AskIndia', 'AskIn

In [None]:
# Display the training TF-IDF matrix in dense form (the result is not stored)
X_train_tfidf.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Count -> TF-IDF -> Multinomial Naive Bayes, bundled as a single (unfitted) estimator
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
nb_fit = Pipeline(steps=nb_steps)

In [None]:
# Naive Bayes Classifier
def nb_classifier(X_train, X_test, y_train, y_test):
    '''
        Train a CountVectorizer -> TfidfTransformer -> MultinomialNB pipeline
        on the training split and print its accuracy on the test split.

        X_train, X_test: text documents fed to the vectorizer
        y_train, y_test: the matching labels
        return: None (the accuracy is printed)
    '''
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('model', MultinomialNB()),
    ]
    # fit() returns the fitted pipeline itself
    fitted = Pipeline(steps).fit(X_train, y_train)

    # Score the predictions made on the test data
    predictions = fitted.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    print("Model Accuracy: {}".format(acc))
    

In [None]:
# Random Forest Classifier
def random_forest(X_train, X_test, y_train, y_test, random_state=42):
    '''
        Train a CountVectorizer -> TfidfTransformer -> RandomForestClassifier
        pipeline on the training split and print its accuracy on the test split.

        X_train, X_test: text documents fed to the vectorizer
        y_train, y_test: the matching labels
        random_state: seed passed to RandomForestClassifier so the forest, and
                      therefore the printed accuracy, is the same on every run;
                      pass None for an unseeded forest
        return: None (the accuracy is printed)
    '''
    forest = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('model', RandomForestClassifier(random_state=random_state)),
                 ])
    forest.fit(X_train, y_train)    # Fit the pipeline to the training data

    # Making predictions on the test data
    y_pred = forest.predict(X_test)
    acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    print("Model Accuracy: {}".format(acc))

In [None]:
# Support Vector Machines Classifier
def svc(X_train, X_test, y_train, y_test):
    '''
        Train a CountVectorizer -> TfidfTransformer -> SVC pipeline on the
        training split and print its accuracy on the test split.

        X_train, X_test: text documents fed to the vectorizer
        y_train, y_test: the matching labels
        return: None (the accuracy is printed)
    '''
    text_to_tfidf = [('vect', CountVectorizer()), ('tfidf', TfidfTransformer())]
    svc_fit = Pipeline(text_to_tfidf + [('model', SVC())])
    svc_fit.fit(X_train, y_train)    # Fit the pipeline to the training data

    # Predict on the test data and score in one step
    acc = accuracy_score(y_true=y_test, y_pred=svc_fit.predict(X_test))
    print("Model Accuracy: {}".format(acc))

In [None]:
# Logistic Regression Classifier
def log_reg(X_train, X_test, y_train, y_test):
    '''
        Train a CountVectorizer -> TfidfTransformer -> LogisticRegression
        pipeline on the training split and print its accuracy on the test split.

        X_train, X_test: text documents fed to the vectorizer
        y_train, y_test: the matching labels
        return: None (the accuracy is printed)
    '''
    logreg = Pipeline(steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('model', LogisticRegression()),
    ])

    # Fit on the training data, then predict the test data with the fitted pipeline
    y_pred = logreg.fit(X_train, y_train).predict(X_test)
    print("Model Accuracy: {}".format(accuracy_score(y_test, y_pred)))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Train the Naive Bayes pipeline on the training split and print its test accuracy
print("Evaluate Naive Bayes Classifier")
nb_classifier(X_train, X_test, y_train, y_test)


Evaluate Naive Bayes Classifier
Model Accuracy: 0.49393939393939396


In [None]:
# Train the Random Forest pipeline on the training split and print its test accuracy
print("Evaluate Random Forest Classifier")
random_forest(X_train, X_test, y_train, y_test)

Evaluate Random Forest Classifier
Model Accuracy: 0.5151515151515151


In [None]:
# Train the Logistic Regression pipeline on the training split and print its test accuracy
print("Evaluate Logistic Regression Model")
log_reg(X_train, X_test, y_train, y_test)


Evaluate Logistic Regression Model
Model Accuracy: 0.5848484848484848


In [None]:
# Train the SVC pipeline on the training split and print its test accuracy
print("Evaluate SVC Model")
svc(X_train, X_test, y_train, y_test)

Evaluate SVC Model
Model Accuracy: 0.5484848484848485
