<a href="https://colab.research.google.com/github/harshita23sharma/Multi_Class_Text_Classification-/blob/master/Multi_class_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive in Colab; the train/test CSV files read later in this
# notebook live under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
%matplotlib inline
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
from nltk.stem.porter import PorterStemmer
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")  # select seaborn's "darkgrid" plot style

In [0]:
import pandas as pd

# Single place to change where the CSV files live on the mounted Drive.
DATA_DIR = "/content/drive/My Drive/zs/data"

# train.csv supplies the job_description / category columns used below;
# test.csv is loaded alongside it.
train = pd.read_csv(DATA_DIR + "/train.csv")
test = pd.read_csv(DATA_DIR + "/test.csv")

In [0]:
import nltk
# Fetch NLTK's "popular" resource bundle; presumably this supplies the stopword
# list and tokenizer data used by the cleaning helpers below — TODO confirm.
nltk.download("popular")

In [0]:
def initial_clean(text):
    """
    Strip websites, email addresses/handles and punctuation, then tokenize.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased tokens produced by ``nltk.word_tokenize``; only ASCII
        letters survive the cleaning.
    """
    # Drop whitespace-delimited chunks that contain "http"/"www" followed by
    # more characters, or that contain "@".
    text = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    # Turn newlines/tabs into plain spaces first; otherwise the next
    # substitution deletes them and glues the neighbouring words together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)  # remove digits and punctuation
    text = text.lower()  # lower case the text
    return nltk.word_tokenize(text)

stop_words = stopwords.words('english')  # NLTK English stopword list, read by remove_stop_words
def remove_stop_words(text):
    """
    Drop every stopword from a tokenized document.

    Parameters
    ----------
    text : iterable of str
        Tokens, e.g. the output of ``initial_clean``.

    Returns
    -------
    list of str
        The tokens that are not in the module-level ``stop_words``, in their
        original order.
    """
    # Snapshot the list as a set once per call: each membership test becomes
    # O(1) instead of a scan over the whole stopword list for every token.
    stop_word_set = set(stop_words)
    return [word for word in text if word not in stop_word_set]

stemmer = PorterStemmer()  # shared Porter stemmer instance used by stem_words
def stem_words(text):
    """
    Stem each token so that inflected forms (e.g. plural/singular) match.

    Parameters
    ----------
    text : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        Stemmed tokens; any result shorter than two characters is discarded.
    """
    stemmed = []
    for word in text:
        try:
            word = stemmer.stem(word)
        except IndexError:
            # The stemmer was seen to fail on the word "oed". Keep only that
            # word unstemmed instead of abandoning the whole document (which
            # also used to skip the one-letter filter below).
            pass
        if len(word) > 1:  # make sure we have no 1 letter words
            stemmed.append(word)
    return stemmed

def apply_all(text):
    """
    Run the full cleaning pipeline on one raw document.

    Steps, in order: ``initial_clean`` -> ``remove_stop_words`` -> ``stem_words``.
    Returns whatever ``stem_words`` returns for the cleaned tokens.
    """
    tokens = initial_clean(text)
    tokens = remove_stop_words(tokens)
    return stem_words(tokens)

In [0]:
def standardize_text(df, text_field):
    """
    Normalize the text in one column of ``df``: drop URLs and @-mentions,
    replace unusual characters with spaces and lower-case everything.

    The column is overwritten in place and the same DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    text_field : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``df[text_field]`` cleaned.
    """
    # (pattern, replacement, is_regex), applied in this order. ``regex`` is
    # always passed explicitly: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would silently disable the regex steps.
    steps = [
        (r"http\S+", "", True),    # whole URLs
        (r"http", "", False),      # any leftover bare "http"
        (r"@\S+", "", True),       # @-mentions / the tail of e-mail addresses
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", True),  # everything outside the whitelist
        (r"@", "at", False),       # a lone "@" that survived becomes the word "at"
    ]
    column = df[text_field]
    for pattern, replacement, is_regex in steps:
        column = column.str.replace(pattern, replacement, regex=is_regex)
    df[text_field] = column.str.lower()
    return df

In [0]:
# Leave job_description untouched; the cleaning is applied to a separate column.
train["clean_text"] = train["job_description"]
train = standardize_text(df=train, text_field="clean_text")

In [0]:
# Identifier, raw text and cleaned text side by side, mainly for inspection.
feature_columns = ['job_no', 'job_description', 'clean_text']
X = train[feature_columns]
# Target labels for the classifier.
y = train['category']


In [12]:
# Preview the first rows to compare raw and cleaned descriptions.
X.head()

Unnamed: 0,job_no,job_description,clean_text
0,Id-12765,Zest Scientific is searching for an accomplis...,zest scientific is searching for an accomplis...
1,Id-22925,"En el mundo de los CRO's típicos, esta compañí...","en el mundo de los cro's t picos, esta compa ..."
2,Id-1321,Asha Mistry of Umbilical Life is recruiting an...,asha mistry of umbilical life is recruiting an...
3,Id-9473,Sales Representative - Laser in der Medizinte...,sales representative laser in der medizinte...
4,Id-14952,Field Service Engineer - Life Science\r\r\r\nC...,field service engineer life science \ncamb...


In [0]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the cleaned descriptions for evaluation; the fixed seed keeps
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train["clean_text"],
    y,
    test_size=0.3,
    random_state=42,
)

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [0]:
# TF-IDF features built from word bigrams only (ngram_range=(2, 2)); min_df=4
# prunes bigrams that occur in too few documents.
vectorizer = TfidfVectorizer(
    ngram_range=(2, 2),
    min_df=4,
    use_idf=True,
    smooth_idf=True,
)



In [0]:
# Chain the TF-IDF step and a linear SVM so raw text goes in and labels come out.
# random_state pins LinearSVC's internal shuffling so re-running the notebook
# yields the same model (same seed as the train/test split).
text_clf = Pipeline([('tfidf', vectorizer), ('clf', LinearSVC(random_state=42))])

In [17]:
# Fit the vectorizer and the classifier on the training split only.
text_clf.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=4, ngram_range=(2, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [0]:
# Predict a category for every description in the held-out 30% split.
predictions = text_clf.predict(X_test)

In [0]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score


In [20]:
# Unlabelled confusion matrix. Each row sums to that class's support in the
# classification report, i.e. rows are the true categories in the report's order.
print(confusion_matrix(y_test,predictions))

[[ 426   60   88   23   19   33   38    0   26   69  188]
 [  45  139   21    0    0    0    2    0    1    1   29]
 [  78   24  243    3    4   10   96    1   61   19  120]
 [   9    0    0  112    0    0    0    0    0   10   14]
 [   7    2    5    0  103    0   14    0    1    5   23]
 [  36    0   14    0    2  200   44    0    0   22   14]
 [  15    2   47    1    8   35 1227    1    0    2   27]
 [   0    0    3    0    0    0    0    2    0    0    0]
 [  17    3   51    0    0    0    1    0  115    3   15]
 [  67    2   45    9    1   15    4    0   11  132   59]
 [ 231   20  139   11   16   19   76    0   25   48  289]]


In [21]:
# Per-category precision, recall, F1 and support on the held-out split.
print(classification_report(y_test,predictions))

                                              precision    recall  f1-score   support

                           Clinical Research       0.46      0.44      0.45       970
              Data Management and Statistics       0.55      0.58      0.57       238
                  Manufacturing & Operations       0.37      0.37      0.37       659
  Medical Affairs / Pharmaceutical Physician       0.70      0.77      0.74       145
   Medical Information and Pharmacovigilance       0.67      0.64      0.66       160
                    Pharmaceutical Marketing       0.64      0.60      0.62       332
Pharmaceutical, Healthcare and Medical Sales       0.82      0.90      0.86      1365
                                    Pharmacy       0.50      0.40      0.44         5
                           Quality-assurance       0.48      0.56      0.52       205
                          Regulatory Affairs       0.42      0.38      0.40       345
                                     Science       0.