In [5]:
import fasttext
import pandas as pd
import re
from nltk.corpus import stopwords

#### <b>1. Prepare Data For Training Word Vectors</b>

In [None]:
# Load the labelled (train) and unlabelled (test) comment sets.
train = pd.read_csv("../data/train.csv", header=0)
test = pd.read_csv("../data/test.csv", header=0)

# Sanity check: report how many rows each file contributed.
print("Read %d labeled train reviews and  %d unlabelled test reviews" % (len(train), len(test)))

# Pool every comment from both files; missing comments become the "_na_" placeholder.
all_comments = (
    train['comment_text'].fillna("_na_").tolist()
    + test['comment_text'].fillna("_na_").tolist()
)

# Write one normalised comment per line: lower-cased, every non-letter turned
# into a space, and runs of whitespace collapsed to a single space.
non_letters = re.compile("[^a-zA-Z]")
with open("data/all_comments.csv", "w+") as comments_file:
    for raw_comment in all_comments:
        letters_only = non_letters.sub(" ", str(raw_comment).lower())
        comments_file.write("%s\n" % " ".join(letters_only.split()))

#### <b> 2. Prepare Data For Training Classifier </b>

In [None]:
# Build the fastText input files used to train, validate and score the classifier.

# Label columns of train.csv, in the order their tags are written to each line.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# The first VALIDATION_ROWS labelled rows are held out for validation; the rest are used for training.
VALIDATION_ROWS = 20000


def clean_comment(text):
    """Return `text` with non-letters replaced by spaces, lower-cased, and whitespace collapsed."""
    return ' '.join(re.sub("[^a-zA-Z]", " ", str(text)).lower().split())


def label_prefix(row):
    """Return the fastText label tags for one labelled row.

    One "__label__<name> " tag is emitted for every column in LABEL_COLUMNS whose
    value is 1; "__label__clean" is appended when the label values sum to 0.
    """
    tags = "".join("__label__%s " % name for name in LABEL_COLUMNS if row[name] == 1)
    if sum(row[name] for name in LABEL_COLUMNS) == 0:
        tags += "__label__clean"
    return tags


def write_fasttext_file(frame, path, labelled=True):
    """Write one line per row of `frame` to `path`.

    Each line is "<tags> <cleaned comment>"; when `labelled` is False the tags are
    empty, so the line is just a leading space followed by the cleaned comment.
    """
    with open(path, "w") as comments_file:
        for _, row in frame.iterrows():
            tags = label_prefix(row) if labelled else ""
            comments_file.write("%s %s\n" % (tags, clean_comment(row['comment_text'])))


# Read the labelled CSV once, fill missing comments, then split it by position.
labelled_comments = pd.read_csv("../data/train.csv", header=0)
labelled_comments['comment_text'] = labelled_comments['comment_text'].fillna("_na_")
validation = labelled_comments.iloc[:VALIDATION_ROWS].copy()
train = labelled_comments.iloc[VALIDATION_ROWS:].copy()

test = pd.read_csv("../data/test.csv", header=0)
test['comment_text'] = test['comment_text'].fillna("_na_")

# Verify the number of comments that were read
print("Read %d labeled train reviews and  %d unlabelled test reviews" % (len(train), len(test)))

write_fasttext_file(test, "data/test_comments.txt", labelled=False)
write_fasttext_file(train, "data/train_comments.csv")
write_fasttext_file(validation, "data/validate_comments.csv")


#### <b> 3. Prepare Word Vecs, Train Model, Test Predictions </b>

In [None]:
# input_file     training file path (required)
# output         output file path (required)
# lr             learning rate [0.05]
# lr_update_rate change the rate of updates for the learning rate [100]
# dim            size of word vectors [100]
# ws             size of the context window [5]
# epoch          number of epochs [5]
# min_count      minimal number of word occurrences [5]
# neg            number of negatives sampled [5]
# word_ngrams    max length of word ngram [1]
# loss           loss function {ns, hs, softmax} [ns]
# bucket         number of buckets [2000000]
# minn           min length of char ngram [3]
# maxn           max length of char ngram [6]
# thread         number of threads [12]
# t              sampling threshold [0.0001]
# silent         disable the log output from the C++ extension [1]
# encoding       specify input_file encoding [utf-8]

In [None]:
import fasttext

# Train skip-gram word vectors on the pooled train + test comments.
# Values in [brackets] are the library defaults, for comparison.
# NOTE(review): the output name mentions "250" and "3ngram", but dim is 150 and
# word_ngrams is 2 below — confirm which is intended.
skipgram_settings = dict(
    input_file='data/all_comments.csv',
    output='skipgram_250_3ngram_fasttext_model',
    # optimisation
    lr=0.015,            # learning rate [0.05]
    lr_update_rate=1,    # rate of updates for the learning rate [100]
    epoch=5,             # number of epochs [5]
    loss='ns',           # loss function {ns, hs, softmax} [ns]
    neg=10,              # number of negatives sampled [5]
    t=0.0001,            # sampling threshold [0.0001]
    # vocabulary and vector shape
    dim=150,             # size of word vectors [100]
    ws=6,                # size of the context window [5]
    min_count=5,         # minimal number of word occurrences [5]
    word_ngrams=2,       # max length of word ngram [1]
    bucket=2000000,      # number of buckets [2000000]
    minn=3,              # min length of char ngram [3]
    maxn=7,              # max length of char ngram [6]
    # runtime
    thread=12,           # number of threads [12]
    silent=0,            # 0 keeps the C++ extension's log output visible [1]
    encoding='utf-8',    # input_file encoding [utf-8]
)
model = fasttext.skipgram(**skipgram_settings)

In [None]:
# Load a saved fastText model from disk; this rebinds `model`, replacing the
# skip-gram model trained in the previous cell.
# NOTE(review): the training cell writes to 'skipgram_250_3ngram_fasttext_model',
# not 'cbow_fasttext_model' — confirm this file exists and is the intended model.
model = fasttext.load_model('cbow_fasttext_model.bin')

#### <b> Training A Classifier </b>

- <b>input_file</b>---------------training file path (required)<br>
- <b>output</b>--------------------output file path (required)<br>
- <b>label_prefix</b>--------------label prefix ['__label__']<br>
- <b>lr</b>------------------------learning rate [0.1] <br>
- <b>lr_update_rate</b>------------change the rate of updates for the learning rate [100]<br>
- <b>dim</b>-----------------------size of word vectors [100]<br>
- <b>ws</b>------------------------size of the context window [5]<br>
- <b>epoch</b>---------------------number of epochs [5]<br>
- <b>min_count</b>-----------------minimal number of word occurrences [1]<br>
- <b>neg</b>-----------------------number of negatives sampled [5]<br>
- <b>word_ngrams</b>---------------max length of word ngram [1]<br>
- <b>loss</b>----------------------loss function {ns, hs, softmax} [softmax]<br>
- <b>bucket</b>--------------------number of buckets [0]<br>
- <b>minn</b>----------------------min length of char ngram [0]<br>
- <b>maxn</b>----------------------max length of char ngram [0]<br>
- <b>thread</b>--------------------number of threads [12]<br>
- <b>t</b>-------------------------sampling threshold [0.0001]<br>
- <b>silent</b>--------------------disable the log output from the C++ extension [1]<br>
- <b>encoding</b>------------------specify input_file encoding [utf-8]<br>
- <b>pretrained_vectors</b>--------pretrained word vectors (.vec file) for supervised learning []<br>

In [None]:
import fasttext

# Train the baseline classifier on the labelled training file, initialising the
# word embeddings from the pretrained Common Crawl vectors.
classifier = fasttext.supervised(
    input_file='data/train_comments.csv',
    output='initial_classifier_commas',
    label_prefix='__label__',
    lr=0.1,
    lr_update_rate=100,
    word_ngrams=3,
    bucket=2000000,
    # dim must equal the dimensionality of the pretrained vectors: the
    # crawl-300d-2M.vec file is 300-dimensional, and fastText rejects a mismatch.
    dim=300,
    ws=10,
    epoch=10,
    min_count=10,
    neg=5,
    pretrained_vectors='word_vector_models/crawl-300d-2M.vec',
    silent=0,  # keep the C++ extension's log output visible
)

In [19]:
import fasttext

# Sweep the `neg` hyper-parameter: train one classifier per value and record
# precision/recall on both the training file and the held-out validation file.
# NOTE(review): no `loss` is passed, and this notebook's parameter list gives
# softmax as the supervised default; `neg` is presumably only used by the `ns`
# loss, which would explain near-identical scores across runs — confirm.
parameters = [1, 3, 5, 10, 15, 50]
training_score = []
for test_param in parameters:
    # Every run writes to the same output name, so only the last model stays on disk.
    classifier = fasttext.supervised(
        input_file='data/train_comments.csv',
        output='pre-trained-vectors',
        label_prefix='__label__',
        lr=0.1,
        lr_update_rate=100,
        word_ngrams=2,
        bucket=2000000,
        dim=300,
        ws=6,
        epoch=7,
        min_count=5,
        neg=test_param,
        pretrained_vectors='word_vector_models/crawl-300d-2M.vec',
        silent=0,
    )
    training = classifier.test('data/train_comments.csv')
    # Note: this rebinds `validation`, which held the validation DataFrame in the
    # data-preparation cell, to the test-result object.
    validation = classifier.test('data/validate_comments.csv')
    # Record the value actually being swept.
    training_score.append({
        "neg": test_param,
        "training_precision": training.precision,
        "training_recall": training.recall,
        "validation_precision": validation.precision,
        "validation_recall": validation.recall,
    })
    print("Completed Iteration: Parameter = %s: Training Precision = %s - Validation Precision = %s" % (test_param,training.precision,validation.precision))

Completed Iteration: Parameter = 1: Training Precision = 0.9870140143175437 - Validation Precision = 0.9564
Completed Iteration: Parameter = 3: Training Precision = 0.98713266799383 - Validation Precision = 0.95675
Completed Iteration: Parameter = 5: Training Precision = 0.98713266799383 - Validation Precision = 0.9564
Completed Iteration: Parameter = 10: Training Precision = 0.9875413639899276 - Validation Precision = 0.95645
Completed Iteration: Parameter = 15: Training Precision = 0.9873831590882124 - Validation Precision = 0.9569
Completed Iteration: Parameter = 50: Training Precision = 0.987396342830022 - Validation Precision = 0.9565


In [None]:
# One row per sweep iteration; columns come from the keys of each recorded dict.
validation_scores = pd.DataFrame(training_score)

In [None]:
# Display the per-run precision/recall table built in the previous cell.
validation_scores

In [None]:
# Headline metrics of the most recent validation run (`validation` is the
# result object returned by classifier.test in the sweep cell).
for caption, attribute in (
    ('P@1:', 'precision'),
    ('R@1:', 'recall'),
    ('Number of examples:', 'nexamples'),
):
    print(caption, getattr(validation, attribute))

#### <b> Getting Predictions </b>

In [None]:
# classifier.labels                  # List of labels
# classifier.label_prefix            # Prefix of the label
# classifier.dim                     # Size of word vector
# classifier.ws                      # Size of context window
# classifier.epoch                   # Number of epochs
# classifier.min_count               # Minimal number of word occurrences
# classifier.neg                     # Number of negatives sampled
# classifier.word_ngrams             # Max length of word ngram
# classifier.loss_name               # Loss function name
# classifier.bucket                  # Number of buckets
# classifier.minn                    # Min length of char ngram
# classifier.maxn                    # Max length of char ngram
# classifier.lr_update_rate          # Rate of updates for the learning rate
# classifier.t                       # Value of sampling threshold
# classifier.encoding                # Encoding used by the classifier
# classifier.test(filename, k)       # Test the classifier
# classifier.predict(texts, k)       # Predict the most likely label
# classifier.predict_proba(texts, k) # Predict the most likely labels together with their probabilities

In [9]:
# Load the cleaned test comments to score: one list entry per line of the file,
# each keeping its trailing newline.
filename = 'data/test_comments.txt'
with open(filename) as test_comments_file:
    content = list(test_comments_file)

In [10]:
# Score every test comment against 7 labels (the six toxicity labels plus 'clean').
labels = classifier.predict_proba(content, 7)

# The sample submission supplies the row ids and the expected column order.
sample_submission = pd.read_csv('../submissions/sample_submission.csv')

# Turn each comment's (label, probability) pairs into a dict, one dict per row.
labelsdict = list(map(dict, labels))
output = pd.DataFrame(labelsdict)

# Attach the ids, drop the unused 'clean' column, and match the submission layout.
output['id'] = sample_submission['id']
output = output.drop('clean', axis=1)
output = output[sample_submission.columns]

In [11]:
# Write the submission file without the DataFrame index column.
# NOTE(review): this writes under 'submissions/' while the sample submission was
# read from '../submissions/' — confirm the intended directory.
output.to_csv('submissions/using_pretrained_vecs_300ep.csv', index=False)

In [12]:
# Scratch cell: the bare, undefined name `hello` was removed because it aborts
# "Restart Kernel -> Run All" with a NameError.