# Hiring Case Study: Task 2
#### first things first: import packages

In [264]:
import pandas as pd
import pyarrow
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score

#### set random seed

In [265]:
np.random.seed(500)

#### load and inspect data

In [266]:
df_forum = pd.read_parquet(r'psoriasis_all_posts.parquet', engine='pyarrow').reset_index(drop=True)
df_categories = pd.read_excel(r'keywords_psoriasis_therapies.xlsx')
print(df_categories.head())
print(df_forum.head())

                    keyword                     category
0                Apremilast  Andere systemische Therapie
1  Phosphodiesterase-Hemmer  Andere systemische Therapie
2                    Otezla  Andere systemische Therapie
3            PDE4-Inhibitor  Andere systemische Therapie
4               Ciclosporin  Andere systemische Therapie
   post_id                                               text       user  \
0   170079   zunehmend oft höre ich über die problematik d...  Annamaria   
1   170082   Hallo Annamaria,  herzlich Willkommen hier im...     Fischi   
2   170210   hallo, Annamaria -  ich wünsche dir wirklich ...       Bibi   
3   170251   Auch ich wünsche euch ein schönes und glückli...  BlackLady   
4   170019   Ich wünsche Allen   Frohes Neues Jahr GB Pics...   Bernd-57   

            timestamp  subject_id       forum_id  
0 2010-01-01 21:48:37       13338    4-community  
1 2010-01-01 21:48:37       13338    4-community  
2 2010-01-01 21:48:37       13338    4-community

#### Preprocess

In [267]:
# Remove blank rows if any.
df_forum['text'].dropna(inplace=True)
# Change all the text to lower case. 
df_forum['text'] = [entry.lower() for entry in df_forum['text']]
# Split into Train and Test data
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df_forum['text'],df_forum['forum_id'],test_size=0.3)
Encoder = LabelEncoder()
# Make Labels
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
# Word Vectorization
Tfidf_vect = TfidfVectorizer(max_features=100)
Tfidf_vect.fit(df_forum['text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
print(Tfidf_vect.vocabulary_)

{'ich': 47, 'über': 99, 'die': 20, 'der': 18, 'auch': 6, 'von': 86, 'und': 84, 'so': 82, 'an': 5, 'in': 51, 'den': 17, 'mich': 62, 'zu': 97, 'bin': 10, 'auf': 7, 'meine': 61, 'haut': 45, 'mit': 64, 'im': 49, 'habe': 40, 'einen': 29, 'viel': 85, 'quote': 74, 'hallo': 42, 'hier': 46, 'dich': 19, 'ein': 26, 'wenn': 90, 'du': 24, 'nur': 70, 'dir': 22, 'es': 32, 'werden': 91, 'dann': 13, 'dass': 15, 'alles': 1, 'für': 34, 'das': 14, 'dem': 16, 'was': 89, 'noch': 68, 'gibt': 36, 'also': 3, 'haben': 41, 'muss': 65, 'sich': 79, 'schon': 75, 'eine': 27, 'psoriasis': 73, 'sein': 77, 'hab': 39, 'diese': 21, 'kann': 55, 'einem': 28, 'um': 83, 'da': 12, 'mal': 57, 'immer': 50, 'grüße': 37, 'mir': 63, 'ihr': 48, 'als': 2, 'bei': 9, 'pso': 72, 'vor': 87, 'wurde': 96, 'einer': 30, 'sind': 81, 'nach': 66, 'aber': 0, 'war': 88, 'jetzt': 54, 'doch': 23, 'wieder': 93, 'sehr': 76, 'nicht': 67, 'wie': 92, 'ganz': 35, 'oder': 71, 'hatte': 44, 'etwas': 33, 'mehr': 59, 'ist': 52, 'sie': 80, 'keine': 56, 'gut':

#### Classify Posts into Forums with Support Vector Machine

In [271]:
SVM = svm.SVC(C=1.0, kernel='linear', gamma='scale')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  45.40466392318244
