In [1]:
import pandas as pd
import re
import pickle
from scipy.sparse import hstack
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on. nltk.download() is a
# no-op (apart from a log line) when the package is already up to date.
nltk.download('stopwords')
nltk.download('punkt')
# WordNetLemmatizer loads the WordNet corpus lazily on the first lemmatize()
# call; without this download a fresh environment fails with LookupError.
nltk.download('wordnet')
stop_words=stopwords.words('english')
lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package stopwords to C:\Users\VS
[nltk_data]     Chaitanya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\VS
[nltk_data]     Chaitanya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    Typographic quotes are first normalised to their ASCII equivalents, then
    the irregular forms "won't" / "can't" are expanded, and finally the
    regular suffixes (n't, 're, 's, 'd, 'll, 't, 've, 'm) are rewritten.

    Matching is case-insensitive, so "Won't" becomes "will not" instead of
    "Wo not". Suffixes are only expanded when they directly follow a word
    character and end at a word boundary, so quoted words such as 'memory'
    or names such as o'dell are left untouched.

    Parameters
    ----------
    phrase : str
        Text to process.

    Returns
    -------
    str
        The text with contractions expanded. Note that "'s" is collapsed to
        a bare "s" (e.g. "it's" -> "its"), not expanded to " is".

    NOTE(review): any vectorizer fitted on text cleaned by an earlier version
    of this function saw slightly different tokens for the cases fixed here;
    use the same version when (re)fitting.
    """
    # Normalise curly quotes so the ASCII patterns below can match them.
    phrase = phrase.replace("\u2019", "'")   # right single quotation mark
    phrase = phrase.replace("\u201d", '"')   # right double quotation mark
    phrase = phrase.replace("\u201c", '"')   # left double quotation mark

    # Irregular contractions must run before the generic n't rule.
    phrase = re.sub(r"\bwon't\b", "will not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"\bcan't\b", "can not", phrase, flags=re.IGNORECASE)

    # Regular suffixes, applied in this order (n't before the bare 't rule).
    suffix_expansions = (
        ("n't", " not"),
        ("'re", " are"),
        ("'s", "s"),
        ("'d", " would"),
        ("'ll", " will"),
        ("'t", " not"),
        ("'ve", " have"),
        ("'m", " am"),
    )
    for suffix, expansion in suffix_expansions:
        # (?<=\w) : the suffix must be attached to a preceding word.
        # \b      : the suffix must end the word (rules out 'memory', o'dell).
        pattern = r"(?<=\w)" + suffix + r"\b"
        phrase = re.sub(pattern, expansion, phrase, flags=re.IGNORECASE)
    return phrase

In [3]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load artifacts that were produced by a trusted training run.
    with open(path, "rb") as fp:
        return pickle.load(fp)


def _clean_text(raw_text, stop_word_set):
    """Clean one raw title/body string before it is vectorized.

    Steps, in order: expand contractions, remove hyperlinks, replace every
    run of non-letters with a single space, drop stop words, lemmatize each
    remaining token, then lowercase and strip the result.

    NOTE(review): stop words are filtered before lowercasing, so capitalised
    stop words ("The", "And") are kept, and the hyperlink pattern removes
    everything from the URL to the end of its line. Both are left as-is on
    purpose: the pickled vectorizers were presumably fitted on text cleaned
    exactly this way -- confirm against the training code before changing.
    """
    sent = decontracted(raw_text)
    sent = re.sub(r'https?:\/\/.*[\r\n]*', '', sent)  # remove hyperlinks
    # remove special characters and numbers: https://stackoverflow.com/a/5843547/4084039
    sent = re.sub('[^A-Za-z]+', ' ', sent)
    kept_tokens = [tok for tok in sent.split() if tok not in stop_word_set]
    sent = ' '.join(lemmatizer.lemmatize(tok) for tok in kept_tokens)
    return sent.lower().strip()


def final(file):
    """Predict a label for every article in a CSV file and save the result.

    The function reads ``file``, drops the 'id' column and duplicate rows,
    fills missing 'title'/'text'/'author' values, cleans the text columns,
    encodes them with the pickled TF-IDF vectorizers, looks up a per-author
    feature row in the pickled ``prob_dict`` (falling back to [0.5, 0.5] for
    authors not in it), and runs the pickled classifier on the stacked
    features.

    Parameters
    ----------
    file : str or path-like
        CSV file containing the columns 'id', 'title', 'author' and 'text'.

    Returns
    -------
    The value returned by ``nb_clf_best.predict``: one prediction per row
    that remains after de-duplication.

    Raises
    ------
    KeyError
        If the CSV has no 'id' column (or lacks one of the text columns).
    FileNotFoundError
        If ``file`` or any of the pickle artifacts is missing.

    Side effects
    ------------
    Reads 'title_tfidf_vectorizer.pickle', 'prob_dict.pickle',
    'text_tfidf_vectorizer.pickle' and 'nb_clf_best.pickle' from the current
    working directory, writes 'test_data_with_predicted_labels.csv' there
    (containing the cleaned text, not the raw text), and prints progress.
    """
    print("Reading Data...")
    data = pd.read_csv(file)

    # dropping the column 'id'
    data = data.drop(columns=['id'])

    print("Data Cleaning...")
    # dropping duplicate rows
    data = data.drop_duplicates()

    # Assign the filled frame back instead of calling
    # data[col].fillna(..., inplace=True): the chained in-place form does not
    # update `data` once pandas Copy-on-Write is active.
    data = data.fillna({'title': ' ', 'text': ' ', 'author': 'missing'})

    print("Data Preprocessing...")
    # Set membership keeps the per-token stop-word test O(1); the result is
    # identical to testing against the original list.
    stop_word_set = set(stop_words)
    data['title'] = [_clean_text(title, stop_word_set) for title in data['title'].values]
    data['text'] = [_clean_text(text, stop_word_set) for text in data['text'].values]

    print("Data Encoding...")
    title_tfidf_vectorizer = _load_pickle("title_tfidf_vectorizer.pickle")
    title_tfidf = title_tfidf_vectorizer.transform(data['title'].values)

    prob_dict = _load_pickle("prob_dict.pickle")
    # Authors missing from prob_dict get the neutral row [0.5, 0.5].
    # presumably the stored rows are per-class probabilities -- TODO confirm
    author_response_code = [prob_dict.get(author, [0.5, 0.5]) for author in data['author']]

    text_tfidf_vectorizer = _load_pickle("text_tfidf_vectorizer.pickle")
    text_tfidf = text_tfidf_vectorizer.transform(data['text'].values)

    # Column order must match the order used when the classifier was fitted.
    data_final_tfidf = hstack((title_tfidf, author_response_code, text_tfidf))

    print("Loading best model and predicting the output labels...")
    nb_clf_best = _load_pickle("nb_clf_best.pickle")
    scores = nb_clf_best.predict(data_final_tfidf)
    print("Done.")

    print("Storing predicted labels in labels column...")
    data_with_labels = data.copy()
    data_with_labels['label'] = scores
    data_with_labels.to_csv("test_data_with_predicted_labels.csv", index=False)
    print("Check 'test_data_with_predicted_labels.csv' for output.")

    return scores

In [4]:
# Run the prediction pipeline on the test CSV; final() also writes the
# labelled data to 'test_data_with_predicted_labels.csv'.
file="test.csv"
predicted_labels=final(file)
# Show only the first ten predictions as a quick sanity check.
print(predicted_labels[:10])

Reading Data...
Data Cleaning...
Data Preprocessing...
Data Encoding...
Loading best model and predicting the output labels...
Done.
Storing predicted labels in labels column...
Check 'test_data_with_predicted_labels.csv' for output.
[0 1 1 0 1 1 0 0 1 0]
