# Use saved pickled classifier to generate labels

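***REQUIRES PYTHON 3***
***DO NOT USE PYTHON 2*** (the code below relies on Python 3-only features such as `html.entities`, `open(..., encoding=...)`, `os.makedirs(..., exist_ok=True)` and `{**a, **b}` dict unpacking)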

This classifier is heavily (99%+) adapted from: Thomas Davidson, Dana Warmsley, Michael Macy, and Ingmar Weber. 2017. “Automated Hate Speech Detection and the Problem of Offensive Language.” ICWSM. Please cite the paper when using the following code.

This code is used to:

- Load the pre-trained classifier and associated files
- Transform new input data into the correct format for the classifier.
- Run the classifier on the transformed data and return results.

## Loading packages

In [None]:
# -*- coding: utf-8 -*-
"""


@author: Kokil
"""


import pandas as pd
import csv
import numpy as np
import sys
import time
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.utils import class_weight
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.sparse import csr_matrix
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import sys
import os
import warnings
import pandas as pd
import json
import string
import re
import nltk
import random
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
#!pip install spacy
#!python3 -m spacy download en_core_web_lg 
#!python3 -m spacy download en_core_web_sm

#!pip install convokit
from convokit import Corpus, Speaker, Utterance
from convokit import download
from convokit import TextParser
from convokit import PolitenessStrategies
import spacy
import nltk
#nltk.download('averaged_perceptron_tagger')
import sklearn.externals
import joblib

In [None]:
# Shared spaCy pipeline (small English model); spacy_tokenizer() below runs
# every input text through it to obtain tokens.
nlp = spacy.load("en_core_web_sm") 

def is_number(tok):
    """Report whether ``tok`` can be parsed by ``float()``.

    Returns True when ``float(tok)`` succeeds and False when it raises
    ``ValueError``. Any other exception propagates to the caller.
    """
    try:
        float(tok)
    except ValueError:
        return False
    else:
        return True

def spacy_tokenizer(text):
    """Tokenize ``text`` with the module-level spaCy pipeline ``nlp``.

    Returns a list with one string per spaCy token; tokens whose text is
    accepted by ``is_number`` are replaced by the placeholder ``'_NUM_'``.
    """
    tokens = []
    for tok in nlp(text):
        surface = tok.text
        tokens.append('_NUM_' if is_number(surface) else surface)
    return tokens


In [None]:
# Default location of the harbinger lexicon (a JSON object on the first line of
# the file, mapping feature name -> list of phrases).
_HARBINGER_LEXICON_PATH = '/home/kokil/feature_extraction/data/2015_Diplomacy_lexicon.json'

# Patterns used by the text cleaner, compiled once instead of once per row.
_HARBINGER_TAG_RE = re.compile('<.*?>')
_HARBINGER_URL_RE = re.compile(r'http\S+')
_HARBINGER_DIGITS_RE = re.compile('[0-9]+')
_HARBINGER_WORD_RE = re.compile(r'\w+')


def _clean_harbinger_text(text):
    """Normalize one message before phrase counting.

    Steps, in order: stringify, drop apostrophes, lowercase, drop the literal
    ``{html}`` marker, strip ``<...>`` tags, strip ``http...`` URLs, strip
    digits, then keep only ``\\w+`` runs joined by single spaces.
    """
    text = str(text).replace('\'', '').lower()
    text = text.replace('{html}', "")
    text = _HARBINGER_TAG_RE.sub('', text)
    text = _HARBINGER_URL_RE.sub('', text)
    text = _HARBINGER_DIGITS_RE.sub('', text)
    # re.findall(r'\w+') yields the same tokens as nltk's RegexpTokenizer(r'\w+').
    return " ".join(_HARBINGER_WORD_RE.findall(text))


def extract_harbingers(df, X_col, lexicon_path=_HARBINGER_LEXICON_PATH):
    """Add harbinger-lexicon count columns to ``df`` in place.

    Args:
        df: pandas DataFrame holding the messages; it is modified in place.
        X_col: name of the column containing the raw message text.
        lexicon_path: path of the lexicon file. Only its first line is parsed
            as JSON (feature name -> list of phrases). Defaults to the path
            that was previously hard-coded.

    Returns:
        None. ``df`` gains a ``clean_text`` column plus one integer column per
        lexicon feature holding the summed substring occurrences of that
        feature's phrases in ``clean_text``.

    Notes:
        * Phrases are ASCII-folded and lowercased. A phrase that becomes empty
          after folding is skipped, because ``str.count('')`` returns
          ``len(text) + 1`` and would otherwise inflate every row's count.
        * Matching is plain substring counting, so a phrase also matches
          inside longer words.
    """
    with open(lexicon_path) as f:
        lexicon = json.loads(f.readline())

    # Positional assignment: works for any index and for an empty frame.
    df['clean_text'] = [_clean_harbinger_text(value) for value in df[X_col]]

    for feature, phrases in lexicon.items():
        folded = (p.encode('ascii', 'ignore').decode('ascii').lower() for p in phrases)
        harbingers = [h for h in folded if h]
        df[feature] = [
            sum(text.count(h) for h in harbingers) for text in df['clean_text']
        ]

In [None]:
# Patterns for preprocess(), written as raw strings (the previous plain-string
# literals relied on invalid escape sequences such as '\s') and compiled once.
_PREPROCESS_SPACE_RE = re.compile(r'\s+')
_PREPROCESS_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
    r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_PREPROCESS_MENTION_RE = re.compile(r'@[\w\-]+')


def preprocess(text_string):
    """Strip URLs and @-mentions from ``text_string``.

    Processing order:
      1. every run of whitespace is collapsed to a single space;
      2. ``http://`` / ``https://`` URLs are removed;
      3. ``@name`` mentions are removed.

    Because whitespace is collapsed *before* the removals, the spaces that
    surrounded a removed URL or mention are left in place (so double spaces
    can appear in the result).

    Args:
        text_string: the text to clean (a ``str``).

    Returns:
        The cleaned string.
    """
    parsed_text = _PREPROCESS_SPACE_RE.sub(' ', text_string)
    parsed_text = _PREPROCESS_URL_RE.sub('', parsed_text)
    parsed_text = _PREPROCESS_MENTION_RE.sub('', parsed_text)
    return parsed_text


#nlp = en_core_web_sm.load()
#spacy.load("en_core_web_lg")
#spacy.load("en_core_web_sm")
# ConvoKit politeness-strategy transformer shared by extract_politeness_feats().
ps = PolitenessStrategies()
# Separate spaCy pipeline for the politeness transformer, loaded with the
# named-entity recognizer disabled.
spacy_nlp = spacy.load('en_core_web_sm', disable=['ner'])
# Names of the politeness-strategy features, discovered by transforming one
# sample utterance and listing the keys of its 'politeness_strategies' metadata.
# These names become the DataFrame columns written by extract_politeness_feats().
cols = list(ps.transform_utterance("hello, could you please help me proofread this article?", spacy_nlp=spacy_nlp).meta['politeness_strategies'])

def extract_politeness_feats(df, X_col):
    """Write one politeness-strategy column per entry of ``cols`` into ``df``.

    Each row's ``X_col`` text is passed through the module-level ``ps``
    transformer (using ``spacy_nlp``); the values found under the utterance's
    ``'politeness_strategies'`` metadata are stored in the columns named by
    ``cols``. ``df`` is modified in place and nothing is returned.
    """

    def _strategy_values(record):
        parsed = ps.transform_utterance(record[X_col], spacy_nlp=spacy_nlp)
        strategies = parsed.meta['politeness_strategies']
        return pd.Series([strategies[name] for name in cols])

    df[cols] = df.apply(_strategy_values, axis=1)



In [None]:
# List harbingers, liwc and politeness features
import json
# Harbinger lexicon: only the first line of the file is read and parsed as JSON.
with open('/home/kokil/feature_extraction/data/2015_Diplomacy_lexicon.json') as f:
    harb_dict = json.loads(f.readline())
# The two CSV files are read only for their header rows (column names).
politeness_dict = pd.read_csv('/home/kokil/feature_extraction/data/politeness_list.csv')
liwc_dict = pd.read_csv('/home/kokil/feature_extraction/data/liwc_list.csv')
# Combined feature-name list: politeness columns, then LIWC columns, then the
# harbinger lexicon keys.
X_cols = list(politeness_dict.columns) + list(liwc_dict.columns) + list(harb_dict.keys())

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
happierfuntokenizer_v3

This code implements a basic, Twitter-aware tokenizer.

A tokenizer is a function that splits a string of text into words. In
Python terms, we map string and unicode objects into lists of unicode
objects.

There is not a single right way to do tokenizing. The best method
depends on the application.  This tokenizer is designed to be flexible
and thus easy to adapt to new domains and tasks.  The basic logic is
this:

1. The tuple regex_strings defines a list of regular expression
   strings.

2. The regex_strings strings are put, in order, into a compiled
   regular expression object called word_re.

3. The tokenization is done by word_re.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   Tokenizer.

4. When instantiating Tokenizer objects, the main option is
   preserve_case.  In this file it defaults to False, in which case
   the tokenizer will downcase everything except for emoticons; set
   it to True to keep the original case.

The __main__ method illustrates by tokenizing a few examples.

I've also included a Tokenizer method tokenize_random_tweet(). If the
twitter library is installed (http://code.google.com/p/python-twitter/)
and Twitter is cooperating, then it should tokenize a random
English-language tweet.
"""

######################################################################

import re
import html.entities
######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears first in the final regex (since it can contain whitespace).
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
#     <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last-ditch whitespace-based tokenization of whatever is left.

# This particular element is used in a couple ways, so we define it
# with a name:
# Regex source for emoticons. It is compiled further down with re.VERBOSE, so
# the layout whitespace and the "#" notes inside the string are ignored by the
# regex engine. Alternatives, in order: western emoticon (eyes first),
# reversed emoticon (mouth first), hearts such as "<3", and a cheek/eye/nose
# form.
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8>]                    # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpPxX/\:\}\{@\|\\] # mouth      
      |
      [\)\]\(\[dDpPxX/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8<]                    # eyes
      [<>]?
      |
      <[/\\]?3                         # heart(added: has)
      |
      \(?\(?\#?                   #left cheeck
      [>\-\^\*\+o\~]              #left eye
      [\_\.\|oO\,]                #nose
      [<\-\^\*\+o\~]              #right eye
      [\#\;]?\)?\)?               #right cheek
    )"""

# The components of the tokenizer:
# Ordered tuple of regex sources. They are joined with "|" into word_re below,
# so an earlier entry wins when several could match at the same position.
# Index 1 (the emoticon pattern) is also compiled on its own as emoticon_re.
regex_strings = (
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?            
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?    
      \d{3}          # exchange
      [\-\s.]*   
      \d{4}          # base
    )"""
    ,
    # Emoticons:
    emoticon_string
    ,    
    # http:
    # Web Address:
    r"""(?:(?:http[s]?\:\/\/)?(?:[\w\_\-]+\.)+(?:com|net|gov|edu|info|org|ly|be|gl|co|gs|pr|me|cc|us|gd|nl|ws|am|im|fm|kr|to|jp|sg)(?:\/[\s\b$])?)"""
    ,
    r"""(?:http[s]?\:\/\/)"""   #need to capture it alone sometimes
    ,
    #command in parens:
    r"""(?:\[[\w_]+\])"""   #need to capture it alone sometimes
    ,
    # HTTP GET Info
    r"""(?:\/\w+\?(?:\;?\w+\=\w+)+)"""
    ,
    # HTML tags:
    r"""(?:<[^>]+\w=[^>]+>|<[^>]+\s\/>|<[^>\s]+>?|<?[^<\s]+>)"""
    #r"""(?:<[^>]+\w+[^>]+>|<[^>\s]+>?|<?[^<\s]+>)"""
    ,
    # Twitter username:
    r"""(?:@[\w_]+)"""
    ,
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # Remaining word types:
    r"""
    (?:[\w][\w'\-_]+[\w])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots. 
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """
    )

######################################################################
# This is the core tokenizing regex:
    
# All regex_strings alternatives wrapped in ONE capturing group, so
# word_re.findall() returns a flat list of matched token strings. Compiled in
# verbose, case-insensitive, Unicode mode.
word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)

# The emoticon string gets its own regex so that we can preserve case for them as needed:
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)

# These are for regularizing HTML entities to Unicode:
# numeric entities such as "&#38;" ...
html_entity_digit_re = re.compile(r"&#\d+;")
# ... and named entities such as "&lt;".
html_entity_alpha_re = re.compile(r"&\w+;")
# The ampersand entity is handled separately by the Tokenizer.
amp = "&amp;"

# Matches a literal backslash followed by "x" and 1-4 characters from [0-9a-z]
# (i.e. escaped byte sequences that appear as text, such as "\xe2").
hex_re = re.compile(r'\\x[0-9a-z]{1,4}')

######################################################################

class Tokenizer:
    """Twitter-aware tokenizer built on the module-level ``word_re`` pattern.

    Args:
        preserve_case: when False (the default) every token except emoticons
            is lowercased; when True tokens keep their original case.
        use_unicode: when True (the default) the input is coerced to ``str``
            before tokenizing (``bytes`` are decoded as UTF-8).
    """

    def __init__(self, preserve_case=False, use_unicode=True):
        self.preserve_case = preserve_case
        self.use_unicode = use_unicode

    def tokenize(self, s):
        """Split ``s`` into a list of token strings.

        Args:
            s: the text to tokenize; ``str``, ``bytes`` or any object whose
               ``str()`` form should be tokenized.

        Returns:
            A ``list`` of ``str`` tokens, in order of appearance. HTML
            entities are converted to characters first, ``&amp;`` becomes
            " and ", and literal ``\\x..`` escapes are replaced by a space.
        """
        # Ensure we work on text. str(bytes) would give the "b'...'" repr, so
        # bytes are decoded explicitly (undecodable bytes become U+FFFD).
        if self.use_unicode:
            if isinstance(s, bytes):
                s = s.decode('utf-8', errors='replace')
            else:
                s = str(s)
        # Fix HTML character entities:
        s = self.__html2unicode(s)
        s = self.__removeHex(s)
        # Tokenize:
        words = word_re.findall(s)
        # Possibly alter the case, but avoid changing emoticons like :D into :d.
        # A list (not a lazy map object) is returned so callers can use len()
        # and slicing on the result.
        if not self.preserve_case:
            words = [w if emoticon_re.search(w) else w.lower() for w in words]
        return words

    def __html2unicode(self, s):
        """Replace the HTML entities in ``s`` with the characters they denote.

        Numeric entities (``&#NNN;``) and named entities (``&name;``) are
        converted; unknown names and out-of-range code points are left as they
        are. Finally every ``&amp;`` is replaced by " and ".
        """
        # First the digits:
        for ent in set(html_entity_digit_re.findall(s)):
            try:
                s = s.replace(ent, chr(int(ent[2:-1])))
            except (ValueError, OverflowError):
                # Code point outside the valid Unicode range: keep the entity.
                pass
        # Now the alpha versions ("&amp;" is handled separately below):
        for ent in set(html_entity_alpha_re.findall(s)):
            if ent == amp:
                continue
            codepoint = html.entities.name2codepoint.get(ent[1:-1])
            if codepoint is not None:
                s = s.replace(ent, chr(codepoint))
        # Done unconditionally; previously this only ran when some other
        # named entity happened to be present in the text.
        return s.replace(amp, " and ")

    def __removeHex(self, s):
        """Replace literal ``\\xNN``-style escape sequences in ``s`` with a space."""
        return hex_re.sub(' ', s)


In [None]:
import pandas as pd
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import requests
from io import StringIO
# Source of the NRC word-level emotion lexicon (downloaded over HTTP).
_LEXMO_NRC_URL = 'https://raw.github.com/dinbav/LeXmo/master/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'

# Local tab-separated lexicon files (word, category, association).
_LEXMO_FILES = {
    'liwc': '/home/kokil/feature_extraction/data/liwc2015.txt',
    'delib': '/home/kokil/feature_extraction/data/dd_delib.txt',
    'hate': '/home/kokil/feature_extraction/data/incivilities.txt',
}

# Category keys pre-registered in the result for each lexicon. The dict order
# (nrc, liwc, delib, hate) is the order used for dictionary == "all".
_LEXMO_CATEGORIES = {
    'nrc': ('anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
            'positive', 'sadness', 'surprise', 'trust'),
    'liwc': ('PPRON', 'BODY', 'WE', 'DEATH', 'FOCUSFUTURE', 'FEEL', 'INTERROG',
             'NUMBER', 'POSEMO', 'NEGATE', 'QUANT', 'THEY', 'AFFECT', 'RELATIV',
             'HOME', 'CONJ', 'COGPROC', 'SEXUAL', 'AUXVERB', 'SHEHE', 'BIO',
             'DIFFER', 'POWER', 'NETSPEAK', 'INFORMAL', 'CAUSE', 'FILLER',
             'INSIGHT', 'LEISURE', 'NEGEMO', 'MOTION', 'SEE', 'FOCUSPAST',
             'ANGER', 'ARTICLE', 'NONFLU', 'MALE', 'WORK', 'FRIEND', 'FUNCTION',
             'RISK', 'FAMILY', 'SPACE', 'I', 'IPRON', 'SOCIAL', 'ASSENT',
             'DRIVES', 'PERCEPT', 'VERB', 'HEAR', 'FEMALE', 'DISCREP', 'YOU',
             'ADJ', 'ACHIEVE', 'RELIG', 'TENTAT', 'COMPARE', 'ADVERB',
             'PRONOUN', 'MONEY', 'FOCUSPRESENT', 'INGEST', 'AFFILIATION',
             'SWEAR', 'HEALTH', 'SAD', 'TIME', 'REWARD', 'ANX', 'PREP',
             'CERTAIN'),
    'delib': ('EMP_RES', 'UNCIVIL_ABUSE', 'CONSTRUCTIVENESS', 'JUSTIFICATION',
              'RECIPROCITY', 'JUST_EXT', 'RELEVANCE', 'JUST_INT'),
    'hate': ('UNCIV', 'OFFEN'),
}

# Pivoted lexicon tables keyed by the ``dictionary`` argument, so the download,
# file reads and pivot happen once per process instead of once per message.
_LEXMO_TABLE_CACHE = {}


def _lexmo_read_lexicon(name):
    """Read one raw lexicon as a (word, emotion, association) DataFrame."""
    if name == 'nrc':
        response = requests.get(_LEXMO_NRC_URL, timeout=30)
        response.raise_for_status()  # fail loudly instead of parsing an error page
        source = StringIO(response.text)
    else:
        source = _LEXMO_FILES[name]
    return pd.read_csv(source,
                       names=["word", "emotion", "association"],
                       sep=r'\t', engine='python')


def _lexmo_table(dictionary, names):
    """Return the cached word x category association table for ``dictionary``."""
    if dictionary not in _LEXMO_TABLE_CACHE:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        entries = pd.concat([_lexmo_read_lexicon(name) for name in names])
        entries = entries.drop_duplicates(subset=['word', 'emotion'])
        _LEXMO_TABLE_CACHE[dictionary] = entries.pivot(index='word',
                                                       columns='emotion',
                                                       values='association')
    return _LEXMO_TABLE_CACHE[dictionary]


def LeXmo(text, document, dictionary):
    '''
    Score a tokenized message against one or all of the lexicons.

    Each word of ``document`` is lowercased and stemmed (Snowball, English)
    and looked up in the selected lexicon(s). For every lexicon category the
    result holds the sum of the matched words' association values divided by
    the number of words in ``document``.

    INPUT:
      text       -- the message text; stored unchanged under the 'text' key
      document   -- iterable of word tokens of the message
      dictionary -- "nrc", "liwc", "delib", "hate", or "all" (every lexicon)
    OUTPUT:
      dict with 'text' plus one entry per category. A category that is
      pre-registered for the chosen lexicon but does not occur in the lexicon
      data keeps its placeholder value ``[]``. For an empty ``document`` every
      lexicon category is 0.0.
    RAISES:
      ValueError if ``dictionary`` is not one of the names above.

    Lexicon tables are cached per ``dictionary`` value for the lifetime of the
    process (the NRC lexicon is fetched over the network on first use).
    '''
    if dictionary == "all":
        names = tuple(_LEXMO_CATEGORIES)
    elif dictionary in _LEXMO_CATEGORIES:
        names = (dictionary,)
    else:
        raise ValueError(
            "unknown dictionary %r; expected 'all' or one of %s"
            % (dictionary, sorted(_LEXMO_CATEGORIES)))

    emodic = {'text': text}
    for name in names:
        for category in _LEXMO_CATEGORIES[name]:
            emodic[category] = []

    table = _lexmo_table(dictionary, names)

    stemmer = SnowballStemmer("english")
    stems = [stemmer.stem(word.lower()) for word in document]
    word_count = len(stems)

    if word_count == 0:
        # Nothing to score: avoid dividing by zero.
        for category in table.columns:
            emodic[category] = 0.0
        return emodic

    # One row per word of the document (all-NaN for words missing from the
    # lexicon; repeated words give repeated rows). sum() skips the NaNs.
    totals = table.reindex(stems).sum()
    for category in table.columns:
        emodic[category] = totals[category] / word_count

    return emodic

In [1]:
import csv

# Module-level tokenizer with default settings. Note that tokenize_messages()
# and emolize_messages() below bind their own local `tokenizer`
# (Tokenizer(preserve_case=True)) and do not use this instance.
tokenizer = Tokenizer()

def tokenize_messages(filename, col_text, col_msgid):
    """Build a binary 1- to 3-gram presence table for the messages in a CSV file.

    Every row of the file is treated as a message (no header row is skipped).
    The text in column ``col_text`` is lowercased, tokenized with
    ``Tokenizer`` and expanded into all unigrams, bigrams and trigrams; each
    n-gram that occurs in the message gets the value 1 in that message's row.

    Args:
        filename: path of the UTF-8 encoded CSV file to read.
        col_text: zero-based index of the column holding the message text.
        col_msgid: zero-based index of the column holding the message id.

    Returns:
        A DataFrame with one row per message, one column per n-gram seen in
        the corpus (1 if present in the message, 0 otherwise) and a
        ``message_id`` column.

    Side effects:
        Writes the frame to ``tokenized_messages_<YYYYmmdd-HHMM>.csv`` in the
        current working directory and prints a progress message.
    """
    ngram_tokenizer = Tokenizer(preserve_case=True)  # built once, not per row
    rows_list = []
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(filename, encoding="utf-8", newline='') as corpus:
        for row in csv.reader(corpus):
            words = ngram_tokenizer.tokenize(row[col_text].lower())
            freqs = {}
            for n in range(1, 4):
                for i in range(len(words) - n + 1):
                    freqs[' '.join(words[i:i + n])] = 1
            freqs["message_id"] = row[col_msgid]
            rows_list.append(freqs)

    # n-grams absent from a message show up as NaN after the frame is built.
    df = pd.DataFrame(rows_list).fillna(0)
    print("Writing tokenized messages to csv...")
    timestr = time.strftime("%Y%m%d-%H%M")
    df.to_csv("tokenized_messages_"+timestr+".csv")
    return df
            
############################

def emolize_messages(filename, col_text, col_msgid, choice):
    """Score every message of a CSV file against the LeXmo lexicons.

    Every row of the file is treated as a message (no header row is skipped).
    The text in column ``col_text`` is lowercased, tokenized with
    ``Tokenizer`` and passed to ``LeXmo``; the resulting dicts become the
    rows of the returned frame.

    Args:
        filename: path of the UTF-8 encoded CSV file to read.
        col_text: zero-based index of the column holding the message text.
        col_msgid: zero-based index of the message-id column. Accepted for
            signature compatibility; the id is not stored in the result.
        choice: lexicon selector forwarded to ``LeXmo`` ("nrc", "liwc",
            "delib", "hate" or "all").

    Returns:
        A DataFrame with one row per message: the lowercased text under
        ``text`` plus one column per lexicon category, NaN replaced by 0.

    Side effects:
        Prints every per-message result, then writes the frame to
        ``<filename>_<choice>_<YYYYmmdd-HHMM>.csv``.
    """
    message_tokenizer = Tokenizer(preserve_case=True)  # built once, not per row
    rows_list = []
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(filename, encoding="utf-8", newline='') as corpus:
        for row in csv.reader(corpus):
            message = row[col_text].lower()
            words = message_tokenizer.tokenize(message)
            emodic = LeXmo(message, words, choice)
            print(emodic)
            rows_list.append(emodic)

    df = pd.DataFrame(rows_list).fillna(0)
    print("Writing emolized messages to csv...")
    timestr = time.strftime("%Y%m%d-%H%M")
    df.to_csv(filename+"_"+choice+"_"+timestr+".csv")
    return df
            
############################
def extract_counts(df, X_col):
    """Append bag-of-words count columns for ``df[X_col]`` and save the result.

    The lowercased texts are vectorized with a ``CountVectorizer`` that uses
    ``spacy_tokenizer`` and spaCy's English stop words. The count matrix is
    joined to ``df`` as columns ``count_0``, ``count_1``, ... (numbered by
    vocabulary index) and the joined frame is written to
    ``<MODIFIED_DATA>/counts.csv``.

    Args:
        df: DataFrame holding the messages (not modified in place).
        X_col: name of the text column.

    Returns:
        The joined DataFrame (previously nothing was returned, unlike
        ``extract_tfidf``).
    """
    # Newer scikit-learn releases validate ``stop_words`` and only accept a
    # list (or 'english' / None), so the spaCy set is converted; sorting keeps
    # the argument deterministic.
    vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, stop_words=sorted(STOP_WORDS), strip_accents='unicode')
    corpus = list(df[X_col].str.lower())
    X = vectorizer.fit_transform(corpus)
    # Reuse df's index so the join lines rows up even when df is not indexed 0..n-1.
    counts = pd.DataFrame(X.toarray(), index=df.index).add_prefix('count_')
    df = df.join(counts)
    df.to_csv(os.path.join(MODIFIED_DATA, 'counts.csv'))
    return df
def extract_tfidf(df, X_col):
    """Append TF-IDF columns for ``df[X_col]`` and save the result.

    The lowercased texts are vectorized with a ``TfidfVectorizer`` (at most
    10000 features) that uses ``spacy_tokenizer`` and spaCy's English stop
    words. The matrix is joined to ``df`` as columns ``tfidf_0``,
    ``tfidf_1``, ... (numbered by vocabulary index) and the joined frame is
    written to ``<MODIFIED_DATA>/tfidf.csv``.

    Args:
        df: DataFrame holding the messages (not modified in place).
        X_col: name of the text column.

    Returns:
        The joined DataFrame.
    """
    # Newer scikit-learn releases validate ``stop_words`` and only accept a
    # list (or 'english' / None), so the spaCy set is converted; sorting keeps
    # the argument deterministic.
    vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, stop_words=sorted(STOP_WORDS), strip_accents='unicode', max_features=10000)
    corpus = list(df[X_col].str.lower())
    X = vectorizer.fit_transform(corpus)
    # Reuse df's index so the join lines rows up even when df is not indexed 0..n-1.
    weights = pd.DataFrame(X.toarray(), index=df.index).add_prefix('tfidf_')
    df = df.join(weights)
    df.to_csv(os.path.join(MODIFIED_DATA, 'tfidf.csv'))
    return df
def _vectorizer_feature_names(vectorizer):
    """Return a fitted vectorizer's feature names as a plain list.

    Uses ``get_feature_names_out`` when the installed scikit-learn provides it
    and falls back to the older ``get_feature_names`` otherwise.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


def extract_tfidf_and_pos(df, X_col):
    """Append the pre-trained TF-IDF and POS features for ``df[X_col]``.

    Loads the pickled TF-IDF vectorizer, IDF vector and POS vectorizer from
    the current working directory, runs ``transform_inputs`` on the text
    column and joins the resulting feature matrix to ``df``.

    Args:
        df: DataFrame holding the messages (not modified in place).
        X_col: name of the text column.

    Returns:
        ``df`` joined with one column per TF-IDF / POS feature name.

    NOTE(review): ``transform_inputs`` is not defined in the visible part of
    this file; this assumes it returns a matrix with exactly one column per
    TF-IDF feature followed by one per POS feature -- confirm against its
    definition.
    """
    print("Loading other information...")
    # NOTE(security): joblib.load unpickles arbitrary objects; only load
    # model files that come from a trusted source.
    tf_vectorizer = joblib.load('final_tfidf_vectorizer.pkl')
    idf_vector = joblib.load('final_idf_vectorizer.pkl')
    pos_vectorizer = joblib.load('final_pos_vectorizer.pkl')

    # The original code passed an undefined name `tf_vectorizer` here and then
    # read an undefined `tfidfs`; both now refer to the objects created above.
    X = transform_inputs(df[X_col], tf_vectorizer, idf_vector, pos_vectorizer)
    matrix = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    # list + list: adding two NumPy string arrays would not concatenate them.
    featurenames = (_vectorizer_feature_names(tf_vectorizer)
                    + _vectorizer_feature_names(pos_vectorizer))
    # Reuse df's index so the join lines rows up even when df is not indexed 0..n-1.
    df_tfidf = pd.DataFrame(matrix, columns=featurenames, index=df.index)
    return df.join(df_tfidf)

NameError: name 'Tokenizer' is not defined

In [None]:

# code
# Python code to merge dict using a single 
# expression
def Merge(dict1, dict2):
    """Return a new dict holding the entries of ``dict1`` then ``dict2``.

    When a key occurs in both, the value from ``dict2`` wins. Neither input
    is modified.
    """
    combined = dict(dict1)
    combined.update(dict2)
    return combined

def extract_feats(filename, X_col, text_col=1, msgid_col=0):
    """Run the full feature-extraction pipeline on a CSV file of messages.

    Steps: lexicon scores for all lexicons (``emolize_messages``), harbinger
    counts, politeness strategies, then the pre-trained TF-IDF / POS features.

    Args:
        filename: path of the input CSV file.
        X_col: name of the text column in the frame produced by
            ``emolize_messages`` that the later extractors read from.
        text_col: zero-based index of the message-text column in the CSV
            (default 1, the previously hard-coded value).
        msgid_col: zero-based index of the message-id column in the CSV
            (default 0, the previously hard-coded value).

    Returns:
        The DataFrame with all features (previously the frame was only
        written to disk and then discarded).

    Side effects:
        Writes ``<filename>_allfeats_<YYYYmmdd-HHMM>.csv`` in addition to the
        files written by the individual extraction steps.
    """
    df = emolize_messages(filename, text_col, msgid_col, "all")
    # These two helpers add their columns to df in place.
    extract_harbingers(df, X_col)
    extract_politeness_feats(df, X_col)
    df = extract_tfidf_and_pos(df, X_col)
    timestr = time.strftime("%Y%m%d-%H%M")
    df.to_csv(filename+"_allfeats_"+timestr+".csv")
    return df
    


In [None]:
# --- Run configuration ---
filename = "data/sample.csv"
DATA_PATH = "data/sample.csv"
OUTPUT_DIR = 'data'     # Root output directory; MODIFIED_DATA below is created inside it
X_col = 'text'  # Name of X column (string)
y_col = 'label'        # Name of y column (0/1)
MODIFIED_DATA = os.path.join(OUTPUT_DIR, 'modified_data')
# makedirs also creates missing parents, so the first call already creates OUTPUT_DIR.
os.makedirs(MODIFIED_DATA, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Build all features for the input file; the result is written to CSV by extract_feats.
extract_feats(filename,"text")

# Pre-trained classifier.
# NOTE(security): joblib.load unpickles arbitrary objects; only load trusted files.
model = joblib.load('final_model.pkl')

# NOTE(review): df_newdata, all_scalers, feature, predictions, unlabeled_df,
# tweets and class_to_name are not defined anywhere in the visible part of
# this file -- confirm they are defined elsewhere before running this cell.
# Feature columns are selected by position: num_ftrs columns starting at feature_start.
feature_start, num_ftrs = 16,1496
X = df_newdata.iloc[:, feature_start:feature_start+num_ftrs]
print(df_newdata.shape)
X_scaled = all_scalers[feature].transform(X)
y = predictions(X_scaled,model)
unlabeled_df['predicted_' + feature] = y
print("Printing predicted values:")
# Opened in append mode: repeated runs add rows to generated_labels.csv.
# One tab-delimited row (the class name) is written per entry of `tweets`.
with open('generated_labels.csv', 'a',newline='') as csvfile: 
    f = csv.writer(csvfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for i,t in enumerate(tweets):
         f.writerow([class_to_name(y[i])])                      
#        for i,t in enumerate(tweets):
#            print(t)
#            print(class_to_name(y[i]))
    