In [1]:
import numpy as np 
import pandas as pd 
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

## 1. Import Data

In [2]:
# Input locations: the train/test CSVs plus an additional CSV that carries
# Outcome, Text and Id columns.
file = '../input/bt5153-applied-machine-learning-2021-spring/train.csv'
file_test = '../input/bt5153-applied-machine-learning-2021-spring/test.csv'
new_15 = '../input/bn-vect-manual-out95/test_over95.csv'

df, df_test, df_15_new = (pd.read_csv(path) for path in (file, file_test, new_15))
# Only these three columns of the additional file are kept.
df_15_new = df_15_new[['Outcome', 'Text', 'Id']]

# Set DEBUG to True to work on the first 1000 rows of every frame.
DEBUG = False
if DEBUG:
    df, df_15_new, df_test = (frame[:1000] for frame in (df, df_15_new, df_test))

df_15_new.head()

Unnamed: 0,Outcome,Text,Id
0,15,Food charities struggle to cover meals lost fr...,955456
1,15,President Said to Be Planning to Use Executive...,955458
2,15,Tracking deregulation in the Trump era,955460
3,15,Former Montana governor hedging on presidentia...,955461
4,15,“The number of ‘compliance incidents’ is jaw-d...,955465


## 2. Data Cleaning

In [3]:
#Text Cleaning
import re
from nltk.corpus import stopwords
# Separator characters that clean_text turns into a space.
# Raw strings keep the backslashes literal and avoid invalid-escape warnings;
# the compiled patterns are unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;#]')
# Anything that is not a digit, a lowercase letter, a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

# Extra tokens removed on top of the NLTK English stopword list.
my_stopwords = {'gt', 'lt', 'xa', 'xd', 'im'}
# Extend the set in place. set.add/set.update return None, so their result must
# not be assigned to `stopwords` (that would replace the imported NLTK corpus
# reader with None).
STOPWORDS.update(my_stopwords)

def clean_text(text):
    """
    Normalise a text string before tokenisation.

    argument: text - a string
    return: text - lowercased string in which separator symbols were replaced
            by spaces, every character outside [0-9a-z #+_] was removed and
            stopwords were dropped; remaining words are joined by single spaces
    """
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # separator symbols become spaces
    # Turn newlines, tabs and other whitespace into plain spaces first:
    # BAD_SYMBOLS_RE only whitelists ' ', so it would otherwise delete them and
    # glue the neighbouring words together ("foo\nbar" -> "foobar").
    text = ' '.join(text.split())
    text = BAD_SYMBOLS_RE.sub('', text)  # delete the remaining disallowed symbols
    # delete stopwords from text
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
    
# Overwrite the training text with its cleaned form.
# NOTE(review): only `df` is cleaned in this cell; df_test and df_15_new keep
# their raw text, so their tokens may differ from the training tokens
# (e.g. hyphens and '/' are handled by clean_text only) - confirm this is intended.
df['Text'] = df['Text'].apply(clean_text)

In [4]:
# Store the whitespace-separated tokens of each text in a 'Split' column.
for frame in (df, df_15_new, df_test):
    frame['Split'] = frame['Text'].apply(lambda text: text.split())

## 3. Features based on Word List

In [5]:
import string
def count_words(text, word_list):
    """
    Count the tokens of `text` that occur in `word_list`.

    argument: text - iterable of string tokens
              word_list - container of keywords to look for
    return: number of tokens that are in word_list after stripping leading and
            trailing punctuation and lowercasing; repeated tokens count each time
    """
    # A list/tuple is scanned linearly on every `in` test, so hash it once.
    # Any other container (set, frozenset, str, ...) is used exactly as given.
    if isinstance(word_list, (list, tuple)):
        try:
            word_list = frozenset(word_list)
        except TypeError:
            pass  # unhashable entries: keep the sequence and its linear scan
    return sum(1 for word in text
               if word.strip(string.punctuation).lower() in word_list)

In [6]:
# Keyword lists keyed by integer id 0-15, each stored as one space-separated
# string; get_lda_features counts matches per id into an 'lda_<id>' column.
# NOTE(review): hyphenated keywords (e.g. 'google-play', 'credit-card') cannot
# match text that went through clean_text, because BAD_SYMBOLS_RE deletes '-'.
lda_list = {0 : 'php post page wordpress echo code posts id plugin plugins development function functions',
            1 : 'visa visas passport passports air travel flight airport airports visit uk us schengen customs immigration transit train trains luggage ticket tickets',
            2 : 'mac iphone macbook os apple macos ios pro itunes ipad icloud applescript safari imac',
            3 : 'android device google google-play galaxy samsung screen',
            4 : 'space spacex spacecraft earth moon mars orbit launch rocket rockets spacecraft satellite nasa apollo',
            5 : 'voltage circuit power power-supply current microcontroller transistor transistors battery batteries amplifier resistor resistors',
            6 : 'money price rate rates value demand tax taxes stock stocks income incomes invest investing credit credit-card mortgage loan loans bank banking trade trading real-estate',
            7 : 'aliens anime book books magic marvel novel read remember story stories ',
            8 : 'block blocks database drupal field module node nodes form forms content node nodes file files user users',
            9 : 'achievement achievements dota game play pc ps3 ps4 pokemon pokemon-go level xbox',
            10 : 'cast casting character cinema dialogue ending movie plot production scene title film films episode',
            11 : 'bitcoin wallet transaction address block btc bitcoins transactions mining blockchain',
            # 'mechanics' was misspelled as 'mechnics', which could never match a real token.
            12 : 'dynamic dynamics electro energy fluid force forces gravity magnetism mass mechanics optics partial quantum light theory wave waves',
            13 : 'arcmap arcpy arcgis geoserver gdal openlayers openstreetmap polygon qgis postgresql postgis pyqgis layer map raster',
            14 : 'amp calculus derivatives differential frac geometry inequality integration linear-algebra mathbb matrices polynomials probability prove infty function sqrt problem set statistics',
            15 : 'republican bill senate republicans states country government president question countries democrats state vote law obama political us house trumps romney court trump gop party'
           }

In [7]:
def get_lda_features(df):
    """
    Add one keyword-count column per entry of lda_list to df, in place.

    argument: df - DataFrame with a 'Split' column holding the tokens of each row
    return: None; df gains a column 'lda_<id>' for every id in lda_list, holding
            count_words(tokens, keywords of that id) for each row
    """
    # Iterate over the dictionary itself so the columns always follow its
    # contents instead of a hard-coded range(16).
    for topic_id, keywords in lda_list.items():
        # Build the lookup set once per id rather than scanning a list for
        # every token of every row.
        word_set = frozenset(keywords.split())
        df['lda_' + str(topic_id)] = df.Split.apply(count_words, args=(word_set,))

In [8]:
%%time
get_lda_features(df)
get_lda_features(df_test)
get_lda_features(df_15_new)

CPU times: user 6min 16s, sys: 357 ms, total: 6min 17s
Wall time: 6min 17s


In [9]:
# Preview the first rows of the training frame, now including the lda_* columns.
df.head()

Unnamed: 0,Outcome,Text,Id,Split,lda_0,lda_1,lda_2,lda_3,lda_4,lda_5,lda_6,lda_7,lda_8,lda_9,lda_10,lda_11,lda_12,lda_13,lda_14,lda_15
0,14,problem first example amir dembo ofer zeitouni...,1,"[problem, first, example, amir, dembo, ofer, z...",0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0
1,14,everyone met tough definite integral follows i...,2,"[everyone, met, tough, definite, integral, fol...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,7,please dont lynch ive never sat entire star wa...,3,"[please, dont, lynch, ive, never, sat, entire,...",0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0
3,14,calculate mathbb z x langle2x1rangle,4,"[calculate, mathbb, z, x, langle2x1rangle]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,2,somebody rings texts iphone also showing ipad ...,5,"[somebody, rings, texts, iphone, also, showing...",0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Write each feature frame to a CSV file in the working directory.
# to_csv is called with its defaults, so the row index is written as well.
for out_name, frame in (('df_lda_feat.csv', df),
                        ('df_15_new_lda_feat.csv', df_15_new),
                        ('df_test_lda_feat.csv', df_test)):
    frame.to_csv(out_name)