![alt text](https://user-images.githubusercontent.com/37707687/62003677-4119b700-b138-11e9-89ad-60725dc3f6f8.png)

## Data Read

In [0]:
import io
import pandas as pd
import numpy as np
df = pd.read_csv("https://datahack-prod.s3.amazonaws.com/train_file/train_F3WbcTw.csv",header=0)

## Data Cleaning

In [0]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    #querywords = string.split()
    #resultwords  = [word for word in querywords if word.lower() not in stop]
    #string = ' '.join(resultwords)
    string = re.sub(r"\’", "\'", string)
    string = re.sub(r"won\'t", "will not", string)
    string = re.sub(r"can\'t", "can not", string)
    string = re.sub(r"don\'t", "do not", string)
    string = re.sub(r"dont", "do not", string)
    string = re.sub(r"n\’t", " not", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\'s", " is", string)
    string = re.sub(r"\’d", " would", string)
    string = re.sub(r"\d", " would", string)
    string = re.sub(r"\'ll", " will", string)
    string = re.sub(r"\'t", " not", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"\'m", " am", string) 
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    string = re.sub(r'[?|!|\'|"|#]',r'',string)
    string = re.sub(r'[.|,|)|(|\|/]',r' ',string)
    return string.strip().lower()
X = []
stop = list(set(df["drug"]))
stop = map(lambda x:x.lower(),stop)
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["sentiment"])

In [15]:
X[1]

'i can completely understand why youd want to try it  but  results reported in lectures do not always stand up to the scrutiny of peer-review during publication  there so much still to do before this is convincing  i hope that it does work out  i really do  and if you are aware of and happy with the risks  then that is great  i just think it is important to present this in a balanced way  and to understand why we do not move straight from the first show of promise in an animal study to using drugs on humans  there is still a lot of animal data to gather  and human data to gather before anyone can tell if it is safe or effective  i can not tell you how many times animal studies do not follow through to humans  but it is one of the major attrition points in drug development  you have been through some of the unpredictability issues with cladribine gilenya  where there was an interaction that was not predicted  but once people try it  the doctors can see patterns and work out what is goin

## Model Pipeline

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
model = Pipeline([('vectorizer', CountVectorizer(stop_words='english')),
 ('tfidf', TfidfTransformer()),
 ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])



## Parameter Tuning

In [17]:
from sklearn.model_selection import GridSearchCV

#from sklearn.grid_search import GridSearchCV
parameters = { 'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)         
            }

gs_clf_svm = GridSearchCV(model, parameters,scoring="f1_macro", n_jobs=-1,verbose=2)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   48.1s finished


0.4883059513911091
{'tfidf__use_idf': False, 'vectorizer__ngram_range': (1, 2)}


## Data Partiton

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [0]:
model = Pipeline([('vectorizer', CountVectorizer(stop_words='english',ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced",random_state=123456987)))])

## Modeling with Best Parameter

In [0]:

model.fit(X_train, y_train)
#evaluation on test data
pred = model.predict(X_test)


## Result Validation

In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score
print(f1_score(pred, y_test,average="macro"))

0.511245180504296


In [22]:
model.fit(X, y)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, voc...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=False)),
                ('clf',
                 OneVsRestClassifier(estimator=LinearSVC(C=1.0,
                                                         class_weight='balanced',
       

## Test Data Prep

In [0]:
ans = []
test = pd.read_csv("https://datahack-prod.s3.amazonaws.com/test_file/test_tOlRoBf.csv",header = 0)
for i in range(test.shape[0]):
    ans.append(clean_str(test.iloc[i][1])+' '+ test.iloc[i][2])

## Prediction

In [0]:
pred = model.predict(ans)

In [0]:
from collections import Counter
print(Counter(a))

Counter({2: 2427, 1: 293, 0: 204})


In [0]:
test.columns

Index(['unique_hash', 'text', 'drug'], dtype='object')

In [0]:
sub =pd.DataFrame()
sub["unique_hash"] = test["unique_hash"]
sub["sentiment"] = a

In [0]:
sub.head()

Unnamed: 0,unique_hash,sentiment
0,9e9a8166b84114aca147bf409f6f956635034c08,2
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,1
2,50b6d851bcff4f35afe354937949e9948975adf7,2
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,2
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,2


## Final Result Submission

In [0]:
sub.to_csv("result_tf2.csv",index = False)