In [None]:
import numpy as np 
import pandas as pd 
import time, os, sys, re, gc, json, multiprocessing
import warnings
warnings.filterwarnings('ignore')
import emoji, random, unicodedata

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook

## Data Description
```python
{
    "org123": {
        "agriculture": 1,
        "cross": 2,
        "education": 3,
        "food": 4,
        "health": 5,
        "livelihood": 6,
        "logistic": 7,
        "nfi": 8,
        "nutrition": 9,
        "protection": 10,
        "shelter": 11,
        "wash": 12
    },
    "org4": {
        "Child Protection": 101,
        "Early Recovery and Livelihood": 102,
        "Education": 103,
        "Food": 104,
        "GBV": 105,
        "Health": 106,
        "Logistic": 107,
        "Mine Action": 108,
        "Nutrition": 109,
        "Protection": 110,
        "Shelter and NFIs": 111,
        "WASH": 112
    }
}
```

In [None]:
# Read the development (labelled) and test CSVs for organisations 1-3.
dev_paths = ['data/org1_dev.csv', 'data/org2_dev.csv', 'data/org3_dev.csv']
test_paths = ['data/org1_test.csv', 'data/org2_test.csv', 'data/org3_test.csv']
org1, org2, org3 = map(pd.read_csv, dev_paths)
test1, test2, test3 = map(pd.read_csv, test_paths)

In [None]:
# Report (rows, columns) for each development frame.
for caption, frame in (
    ("Length of ORG1 : ", org1),
    ("Length of ORG2 : ", org2),
    ("Length of ORG3 : ", org3),
):
    print(caption, frame.shape)

In [None]:
# Preview the first rows of the org1 development frame.
org1.head()

In [None]:
# Distinct values found in org1's `language` column.
org1.language.unique()

In [None]:
# Inspect the org1 rows whose language code is 'id' (presumably Indonesian — confirm).
org1.loc[org1.language == 'id']

In [None]:
# Number of distinct raw values in the `labels` column (each distinct label string counts once).
org1.labels.nunique()

In [None]:
# Work on a five-row copy of org1 so the encoding experiment below cannot touch org1 itself.
org_try = org1.copy().iloc[:5]
org_try

In [None]:
# Demonstrate the per-label indicator encoding on the small sample.
# Each label string is split into its digit runs once, and membership is tested
# on whole ids: a plain substring test (`'1' in '12'`) would wrongly switch on
# columns '1' and '2' for a row labelled 12 (likewise '1' for 10 and 11).
sample_label_ids = org_try['labels'].apply(lambda x: set(re.findall(r'\d+', str(x))))
for i in range(1, 13):
    org_try[str(i)] = sample_label_ids.apply(lambda ids: 1 if str(i) in ids else 0)
org_try

In [None]:
# Add one 0/1 indicator column per label id ('1'..'12') to every frame.
#
# * Each development frame is encoded from ITS OWN `labels` column (org2 and
#   org3 must not be encoded from org1's labels).
# * Label strings are split into whole ids before testing membership, so that
#   e.g. a row labelled 12 does not also switch on columns '1' and '2'.
for frame in (org1, org2, org3):
    label_ids = frame['labels'].apply(lambda x: set(re.findall(r'\d+', str(x))))
    for i in range(1, 13):
        frame[str(i)] = label_ids.apply(lambda ids: 1 if str(i) in ids else 0)

# Test frames have no labels; create the same columns as zero placeholders.
for frame in (test1, test2, test3):
    for i in range(1, 13):
        frame[str(i)] = 0

In [None]:
# Preview org1 again, now including the indicator columns '1'..'12' added above.
org1.head()

So now we have one-hot encoded values for the labels. From here on we ignore the original `labels` column and make predictions on the 12 newly created columns.

In [None]:
# Count how many org1 rows carry each label: drop the non-indicator columns and
# sum every remaining (0/1) column.
df = org1.drop(columns=['id', 'entry_original', 'labels', 'language', 'entry_translated'])
categories = list(df.columns.values)
counts = [(category, df[category].sum()) for category in categories]
df_stats = pd.DataFrame(counts, columns=['category', 'number_of_comments'])
df_stats

In [None]:
# Names of the twelve indicator columns, '1' through '12'.
label_cols = list(map(str, range(1, 13)))
label_cols

## Text Preprocessing

In [None]:
symbols_to_isolate = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'
symbols_to_delete = '\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'
CONTRACTION_MAPPING = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }
# Hand-written character translation table (in the form str.translate expects):
#   * invisible/control characters (soft hyphen, DEL, BOM, zero-width space,
#     directional marks) are deleted (mapped to None);
#   * typographic quotes and guillemets become plain ' or ";
#   * small-capital / look-alike letters become the plain Latin capital they resemble;
#   * em and en dashes become "-".
CUSTOM_TABLE = str.maketrans(
    {
        "\xad": None,
        "\x7f": None,
        "\ufeff": None,
        "\u200b": None,
        "\u200e": None,
        "\u202a": None,
        "\u202c": None,
        "‘": "'",
        "’": "'",
        "`": "'",
        "“": '"',
        "”": '"',
        "«": '"',
        "»": '"',
        "ɢ": "G",
        "ɪ": "I",
        "ɴ": "N",
        "ʀ": "R",
        "ʏ": "Y",
        "ʙ": "B",
        "ʜ": "H",
        "ʟ": "L",
        "ғ": "F",
        "ᴀ": "A",
        "ᴄ": "C",
        "ᴅ": "D",
        "ᴇ": "E",
        "ᴊ": "J",
        "ᴋ": "K",
        "ᴍ": "M",
        "Μ": "M",
        "ᴏ": "O",
        "ᴘ": "P",
        "ᴛ": "T",
        "ᴜ": "U",
        "ᴡ": "W",
        "ᴠ": "V",
        "ĸ": "K",
        "в": "B",
        "м": "M",
        "н": "H",
        "т": "T",
        "ѕ": "S",
        "—": "-",
        "–": "-",
    }
)

# Translation table that deletes every code point whose Unicode general
# category is "Mn" (non-spacing mark): each such code point maps to None.
NMS_TABLE = {
    code_point: None
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)) == "Mn"
}

# Per-script translation tables: every code point of a script range collapses to
# one representative character, so non-English text keeps a "this script was
# here" signal without exploding the vocabulary.
# The bounds are the first and LAST code point of each range; `range` excludes
# its stop value, hence the explicit `+ 1` so the last code point is covered too.
HEBREW_TABLE = {i: "א" for i in range(0x0590, 0x05FF + 1)}
ARABIC_TABLE = {i: "ا" for i in range(0x0600, 0x06FF + 1)}
CHINESE_TABLE = {i: "是" for i in range(0x4E00, 0x9FFF + 1)}
KANJI_TABLE = {i: "ッ" for i in range(0x2E80, 0x2FD5 + 1)}
HIRAGANA_TABLE = {i: "ッ" for i in range(0x3041, 0x3096 + 1)}
KATAKANA_TABLE = {i: "ッ" for i in range(0x30A0, 0x30FF + 1)}

# Single translation table used by normalize(). Later tables win on key clashes,
# so the order matters: custom replacements, then non-spacing-mark deletion,
# then the per-script tables for non-English languages.
TABLE = {
    **CUSTOM_TABLE,
    **NMS_TABLE,
    # Non-english languages
    **CHINESE_TABLE,
    **HEBREW_TABLE,
    **ARABIC_TABLE,
    **HIRAGANA_TABLE,
    **KATAKANA_TABLE,
    **KANJI_TABLE,
}


# Single whitespace character / run of whitespace characters.
RE_SPACE = re.compile(r"\s")
RE_MULTI_SPACE = re.compile(r"\s+")
# str.translate tables: pad each "isolate" symbol with a space on both sides,
# and map each "delete" symbol to the empty string.
isolate_dict = {ord(symbol): f' {symbol} ' for symbol in symbols_to_isolate}
remove_dict = dict.fromkeys(map(ord, symbols_to_delete), '')

In [None]:
import emoji

# NOTE(review): `get_emoji_regexp` and `UNICODE_EMOJI_ALIAS` are not available in
# every release of the `emoji` package — confirm the installed/pinned version.
EMOJI_REGEXP = emoji.get_emoji_regexp()

# Emoji -> " EMJ <alias> ", where the alias has its surrounding colons stripped
# and underscores turned into spaces.
UNICODE_EMOJI = {
    symbol: f" EMJ {alias.strip(':').replace('_', ' ')} "
    for symbol, alias in emoji.UNICODE_EMOJI_ALIAS.items()
}


def remove_emoji(string: str):
    """Replace each emoji in ``string`` with its `` EMJ <alias> `` text.

    Matches that have no entry in ``UNICODE_EMOJI`` are left as they are.
    Afterwards every U+FE0F character is removed from the result.
    """
    described = EMOJI_REGEXP.sub(
        lambda hit: UNICODE_EMOJI.get(hit.group(0), hit.group(0)),
        string,
    )
    return re.sub("\ufe0f", "", described)

In [None]:
def normalize(text: str):
    """Clean one raw entry into a whitespace-normalised string for vectorisation.

    Steps, in order: collapse whitespace characters to plain spaces, expand
    space-delimited English contractions, replace ``@mention`` / ``#hashtag``
    tokens with ``USER`` / ``HASHTAG``, delete the characters in
    ``remove_dict``, pad the characters in ``isolate_dict`` with spaces,
    rewrite emoji via ``remove_emoji``, apply NFKD normalisation and the
    ``TABLE`` translation, then squeeze whitespace runs and strip the ends.

    Args:
        text: Raw entry text. Non-string values (e.g. the float NaN pandas
            uses for missing cells) are treated as empty text.

    Returns:
        The normalised string (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only value unequal to itself; NaN/None carry no text.
        text = "" if text is None or text != text else str(text)

    # Turn newlines, tabs, NBSP, ... into spaces BEFORE the delete table runs:
    # remove_dict deletes those characters outright, which would otherwise glue
    # the neighbouring words together ("food\nshelter" -> "foodshelter").
    text = RE_SPACE.sub(" ", text)

    for k, v in CONTRACTION_MAPPING.items():
        text = text.replace(' %s ' % k, ' %s ' % v)

    # Substitute mentions/hashtags while "@"/"#" are still attached to their
    # word; isolate_dict pads both symbols with spaces, after which the handle
    # could never be matched. The padding keeps the placeholder a separate token
    # and is squeezed away by the final whitespace collapse.
    # (Character class is A-Z: "A-z" would also match "[", "\", "]", "^", "_", "`".)
    text = re.sub("@[a-zA-Z]*", " USER ", text)
    text = re.sub("#[a-zA-Z]*", " HASHTAG ", text)

    text = text.translate(remove_dict)
    text = text.translate(isolate_dict)
    text = remove_emoji(text)
    text = unicodedata.normalize("NFKD", text)
    text = text.translate(TABLE)
    return RE_MULTI_SPACE.sub(" ", text).strip()

In [None]:
%%time
# Normalise the translated text of every frame, spreading the rows of each
# frame over four worker processes.
with multiprocessing.Pool(processes=4) as pool:
    (org1_list, org2_list, org3_list,
     test1_list, test2_list, test3_list) = [
        pool.map(normalize, frame.entry_translated.tolist())
        for frame in (org1, org2, org3, test1, test2, test3)
    ]

In [None]:
# Store each normalised list as the `text` column of the frame it came from.
for frame, cleaned in (
    (org1, org1_list),
    (org2, org2_list),
    (org3, org3_list),
    (test1, test1_list),
    (test2, test2_list),
    (test3, test3_list),
):
    frame['text'] = cleaned

In [None]:
# Stack the three organisations row-wise (original row indices are kept, so
# the combined index contains repeats).
org123 = pd.concat([org1, org2, org3], axis=0)
test123 = pd.concat([test1, test2, test3], axis=0)
for combined in (org123, test123):
    print(combined.shape)

In [None]:
# Persist the combined frames (without the index) for the later sections.
for frame, path in ((org123, 'data/train123.csv'), (test123, 'data/test123.csv')):
    frame.to_csv(path, index=False)

## Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Word-level TF-IDF over unigrams and bigrams.
# The on/off switches are passed as real booleans: recent scikit-learn releases
# validate parameter types and reject the integer 1 for boolean options.
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,            # ignore terms that occur in fewer than 5 documents
    max_df=0.9,          # ignore terms that occur in more than 90% of documents
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_features=50000,
    stop_words='english',
)

In [None]:
%%time
# Per-organisation matrices: the vocabulary is fitted on org1 only and then
# reused to transform the other five frames.
org1_tfidf = word_vectorizer.fit_transform(org1['text'])
org2_tfidf, org3_tfidf, test1_tfidf, test2_tfidf, test3_tfidf = (
    word_vectorizer.transform(frame['text'])
    for frame in (org2, org3, test1, test2, test3)
)
# Combined matrices: this refits the same vectorizer on org123, replacing the
# org1-only vocabulary (the matrices above were already computed and are unaffected).
org123_tfidf = word_vectorizer.fit_transform(org123['text'])
test123_tfidf = word_vectorizer.transform(test123['text'])

In [None]:
# naive bayes
def pr(y_i, y):
    """Smoothed per-feature ratio for the rows of class ``y_i``.

    Reads the notebook-global feature matrix ``x``: sums the rows of ``x`` where
    ``y == y_i`` column-wise, adds 1, and divides by (number of such rows + 1).
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    return (feature_totals + 1) / (in_class.sum() + 1)

In [None]:
# fitting a model one dependent at a time
def get_model(y):
    """Fit a logistic regression for one label column on ratio-scaled features.

    Uses the notebook-global feature matrix ``x``. The features are multiplied
    element-wise by ``r = log(pr(1, y) / pr(0, y))`` before fitting.

    Args:
        y: pandas Series holding the 0/1 targets for one label.

    Returns:
        ``(fitted_model, r)`` — ``r`` must also be applied to any matrix that is
        later passed to the model for prediction.
    """
    targets = y.values
    log_ratio = np.log(pr(1, targets) / pr(0, targets))
    classifier = LogisticRegression(C=4, dual=True, solver='liblinear')
    classifier.fit(x.multiply(log_ratio), targets)
    return classifier, log_ratio


In [None]:
# test1: train on org1, score test1.
# `x` is read as a global by pr()/get_model(), so it must be set before fitting.
train, x, test_x = org1, org1_tfidf, test1_tfidf
preds = np.zeros((len(test1), len(label_cols)))
print(preds.shape)

# One model per label column; keep the probability of the positive class.
for col_idx, label in enumerate(label_cols):
    print('fit', label)
    m, r = get_model(train[label])
    preds[:, col_idx] = m.predict_proba(test_x.multiply(r))[:, 1]


In [None]:
# ids next to the twelve per-label probabilities for test1.
submid = pd.DataFrame({'id': test1["id"]})
submission1 = pd.concat([submid, pd.DataFrame(preds, columns=label_cols)], axis=1)
# Predicted label = 1-based position of the highest probability in each row.
values = [np.argmax(row) + 1 for row in preds]
submission1['predicted_label'] = values
del values
gc.collect()
submission1.head()

In [None]:
# test 2: train on org2, score test2.
# `x` is read as a global by pr()/get_model(), so it must be set before fitting.
train, x, test_x = org2, org2_tfidf, test2_tfidf
preds = np.zeros((len(test2), len(label_cols)))
print(preds.shape)

# One model per label column; keep the probability of the positive class.
for col_idx, label in enumerate(label_cols):
    print('fit', label)
    m, r = get_model(train[label])
    preds[:, col_idx] = m.predict_proba(test_x.multiply(r))[:, 1]


In [None]:
# ids next to the twelve per-label probabilities for test2.
submid = pd.DataFrame({'id': test2["id"]})
submission2 = pd.concat([submid, pd.DataFrame(preds, columns=label_cols)], axis=1)
# Predicted label = 1-based position of the highest probability in each row.
values = [np.argmax(row) + 1 for row in preds]
submission2['predicted_label'] = values
del values
gc.collect()
submission2.head()

In [None]:
# test 3: train on org3, score test3.
# `x` is read as a global by pr()/get_model(), so it must be set before fitting.
train, x, test_x = org3, org3_tfidf, test3_tfidf
preds = np.zeros((len(test3), len(label_cols)))
print(preds.shape)

# One model per label column; keep the probability of the positive class.
for col_idx, label in enumerate(label_cols):
    print('fit', label)
    m, r = get_model(train[label])
    preds[:, col_idx] = m.predict_proba(test_x.multiply(r))[:, 1]


In [None]:
# ids next to the twelve per-label probabilities for test3.
submid = pd.DataFrame({'id': test3["id"]})
submission3 = pd.concat([submid, pd.DataFrame(preds, columns=label_cols)], axis=1)
# Predicted label = 1-based position of the highest probability in each row.
values = [np.argmax(row) + 1 for row in preds]
submission3['predicted_label'] = values
del values
gc.collect()
submission3.head()

In [None]:
# Shapes of the three per-organisation prediction frames.
for part in (submission1, submission2, submission3):
    print(part.shape)

In [None]:
# Stack the per-organisation predictions row-wise into one frame.
per_org_parts = [submission1, submission2, submission3]
submission123 = pd.concat(per_org_parts, axis=0)
print(submission123.shape)

In [None]:
# Preview the stacked per-organisation predictions.
submission123.head()

In [None]:
# Column names of the stacked predictions frame.
submission123.columns

In [None]:
# test 123 all combined as one test set: train on org123, score test123.
# `x` is read as a global by pr()/get_model(), so it must be set before fitting.
train, x, test_x = org123, org123_tfidf, test123_tfidf
preds = np.zeros((len(test123), len(label_cols)))
print(preds.shape)

# One model per label column; keep the probability of the positive class.
for col_idx, label in enumerate(label_cols):
    print('fit', label)
    m, r = get_model(train[label])
    preds[:, col_idx] = m.predict_proba(test_x.multiply(r))[:, 1]


In [None]:
# test123 is a concatenation, so its index repeats; reset it so the ids line up
# positionally with the rows of `preds`. reset_index() keeps the old index as an
# extra 'index' column.
submid = pd.DataFrame({'id': test123["id"]}).reset_index()
submission123_combined = pd.concat(
    [submid, pd.DataFrame(preds, columns=label_cols)], axis=1
)
# Predicted label = 1-based position of the highest probability in each row.
values = [np.argmax(row) + 1 for row in preds]
submission123_combined['predicted_label'] = values
del values
gc.collect()
submission123_combined.head()

## ORG4

In [None]:
# Read the org4 development (labelled) and test CSVs.
org4, test4 = map(pd.read_csv, ['data/org4_dev.csv', 'data/org4_test.csv'])

In [None]:
# Preview the first rows of the org4 development frame.
org4.head()

In [None]:
# Names of the twelve org4 indicator columns, '101' through '112'.
org4_cols = list(map(str, range(101, 113)))
org4_cols

In [None]:
# One 0/1 indicator column per org4 label id; test4 gets zero placeholders.
for label_id in map(str, range(101, 113)):
    org4[label_id] = org4['labels'].apply(lambda x: int(label_id in x))
    test4[label_id] = 0

In [None]:
# Preview org4 again, now including the indicator columns '101'..'112'.
org4.head()

In [None]:
%%time
# Normalise the translated org4 text using four worker processes.
with multiprocessing.Pool(processes=4) as pool:
    org4_list, test4_list = [
        pool.map(normalize, frame.entry_translated.tolist())
        for frame in (org4, test4)
    ]

In [None]:
# Store each normalised list as the `text` column of the frame it came from.
for frame, cleaned in ((org4, org4_list), (test4, test4_list)):
    frame['text'] = cleaned

In [None]:
# Persist the processed org4 frames (without the index).
for frame, path in ((org4, 'data/org4.csv'), (test4, 'data/test4.csv')):
    frame.to_csv(path, index=False)

In [None]:
# Reload the processed org4 frames from disk.
org4 = pd.read_csv('data/org4.csv')
test4 = pd.read_csv('data/test4.csv')
# A normalised text that ended up empty is written as an empty CSV field and
# comes back as NaN, which TfidfVectorizer refuses as a document; restore "".
org4['text'] = org4['text'].fillna('')
test4['text'] = test4['text'].fillna('')

In [None]:
%%time
# Refit the shared vectorizer on org4 text (this replaces the vocabulary fitted
# earlier) and transform test4 with that org4 vocabulary.
org4_tfidf = word_vectorizer.fit_transform(org4['text'])
test4_tfidf = word_vectorizer.transform(test4['text'])

In [None]:
# Distinct values of the '107' indicator in org4; a single value means the
# column is constant and there is nothing for a classifier to learn.
org4['107'].unique()

Hence label 107 never occurs in the org4 development (training) data, so there is nothing to learn for it and we can effectively ignore it.

In [None]:
# test 4: train on org4, score test4.
# `x` is read as a global by pr()/get_model(), so it must be set before fitting.
test_x = test4_tfidf
x = org4_tfidf
train = org4
preds = np.zeros((len(test4), len(org4_cols)))
print(preds.shape)

for i, j in enumerate(org4_cols):
    print('fit', j)
    # A label column with a single class cannot be fitted (the solver needs at
    # least two classes). This is the case the notebook originally special-cased
    # for '107'; detect it from the data so any constant column is handled.
    # The constant value itself (0 or 1) is used as the predicted probability.
    if train[j].nunique() < 2:
        preds[:, i] = float(train[j].iloc[0]) if len(train) else 0
        continue
    m, r = get_model(train[j])
    preds[:, i] = m.predict_proba(test_x.multiply(r))[:, 1]


In [None]:
# ids next to the twelve per-label probabilities for test4.
submid = pd.DataFrame({'id': test4["id"]})
submission4 = pd.concat([submid, pd.DataFrame(preds, columns=org4_cols)], axis=1)
# Predicted label id = 101 + position of the highest probability in each row.
values = [np.argmax(row) + 101 for row in preds]
submission4['predicted_label'] = values
del values
gc.collect()
submission4.head()

In [None]:
# Drop the probability columns from both frames, then stack the per-organisation
# predictions for org1-3 on top of the org4 predictions.
submission123_needed = submission123.drop(columns=label_cols)
submission4_needed = submission4.drop(columns=org4_cols)
submission1234 = pd.concat([submission123_needed, submission4_needed], axis=0)
print(submission1234.shape)

In [None]:
# Same as the previous cell, but using the model trained on org1-3 combined.
submission123_combined_needed = submission123_combined.drop(columns=label_cols)
submission4_needed = submission4.drop(columns=org4_cols)
submission1234_combined = pd.concat(
    [submission123_combined_needed, submission4_needed], axis=0
)
print(submission1234_combined.shape)

In [None]:
# Remove the helper 'index' column left behind by reset_index().
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
submission1234_combined = submission1234_combined.drop(columns='index', errors='ignore')

In [None]:
# Show the last rows of the stacked submission (these come from submission4_needed).
submission1234.tail()

In [None]:
# Write the per-organisation-model submission; index=False leaves the row index out of the file.
submission1234.to_csv('data/sample_submission.csv',index=False)

In [None]:
# Write the combined-model submission; index=False leaves the row index out of the file.
submission1234_combined.to_csv('data/sample_submission_combined.csv',index=False)

## Normal Logistic Regression 

In [None]:
import pandas as pd 
import numpy as np

In [None]:
# Reload the combined org1-3 frames written earlier in the notebook.
train123 = pd.read_csv("data/train123.csv")
test123 = pd.read_csv("data/test123.csv")
# A normalised text that ended up empty is written as an empty CSV field and
# comes back as NaN, which TfidfVectorizer refuses as a document; restore "".
train123['text'] = train123['text'].fillna('')
test123['text'] = test123['text'].fillna('')
train123.head()

In [None]:
# Single-label target: the first label id listed for each row.
# Take the first run of digits, not the first character — `x[0]` would turn the
# ids 10, 11 and 12 into "1" (and fails outright if the column was read as int).
# The result is an integer column, matching the integer ids used for org4.
train123['label_one'] = (
    train123['labels']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)
train123.head()

In [None]:
# Number of distinct single-label targets.
train123['label_one'].nunique()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Word-level TF-IDF over unigrams and bigrams (same settings as earlier in the notebook).
# The on/off switches are passed as real booleans: recent scikit-learn releases
# validate parameter types and reject the integer 1 for boolean options.
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,            # ignore terms that occur in fewer than 5 documents
    max_df=0.9,          # ignore terms that occur in more than 90% of documents
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_features=50000,
    stop_words='english',
)

In [None]:
%%time
# Fit the vocabulary on the training text only, then reuse it for the test text.
train_tfidf = word_vectorizer.fit_transform(train123['text'])
test_tfidf = word_vectorizer.transform(test123['text'])

In [None]:
# Target vector for the single-label classifier.
y_train = train123['label_one']

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
import pickle

# Regularisation types to try.
penalty = ['l1', 'l2']

# Create regularization hyperparameter space: 10 values of C, log-spaced from 1 to 10^4.
C = np.logspace(0, 4, 10)

hyperparameters = dict(C=C, penalty=penalty)
# The solver is set explicitly to liblinear, which supports both l1 and l2.
# With the library's current default solver every 'l1' candidate fails to fit,
# silently discarding half of the grid.
lr = GridSearchCV(
    LogisticRegression(random_state=23, solver='liblinear'),
    hyperparameters,
    cv=5,
    n_jobs=-1,
    verbose=0,
)
print("[INFO]Training..")
lr.fit(train_tfidf, y_train)

# Persist the fitted search object (includes the refitted best estimator).
print("[INFO]Saving...")
with open('logisticRegression_model.pickle', 'wb') as file:
    pickle.dump(lr, file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Sanity check: predict the label of the first test row only.
lr.predict(test_tfidf[0])

In [None]:
# Predict a label for every test123 row and keep it in the 'model_lr' column.
test123['model_lr'] = lr.predict(test_tfidf)

In [None]:
# Preview submission4. NOTE(review): it is created in the ORG4 section above, so
# this cell depends on that section having been run in the same kernel.
submission4.head()

In [None]:
# Two-column frame: id and the label predicted by the grid-searched model.
submission = pd.DataFrame({
    'id': test123["id"],
    'predicted_label': test123['model_lr'],
})
submission.head()

In [None]:
# Append the org4 predictions (probability columns dropped) to the org1-3 predictions.
submission4_needed = submission4.drop(columns=org4_cols)
submission_all = pd.concat([submission, submission4_needed], axis=0)
print(submission_all.shape)

In [None]:
# Write the logistic-regression submission; index=False leaves the row index out of the file.
submission_all.to_csv('data/simple_LR.csv',index=False)

In [None]:
# Write the org4-only predictions (id and predicted_label) to their own file.
submission4_needed.to_csv('data/submission4.csv',index=False)