In [1]:
import pandas as pd
import numpy as np
from multiprocessing import Pool

In [3]:
%%time
data=pd.DataFrame()
data_input=pd.read_csv('data/train-sample_October_9_2012_v2.csv')

CPU times: user 2 s, sys: 156 ms, total: 2.16 s
Wall time: 3.33 s


In [4]:
num_partitions = 12 #number of partitions to split dataframe
num_cores = 12 #number of cores on your machine

In [5]:
def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = np.concatenate(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [6]:
def convert_string_to_date(data):
    return pd.to_datetime(data)    

In [7]:
%%time
post_val=data_input['PostCreationDate'].values
post_creation_date = parallelize_dataframe(post_val, convert_string_to_date)

CPU times: user 44 ms, sys: 68 ms, total: 112 ms
Wall time: 5.19 s


In [8]:
%%time
owner_val=data_input['OwnerCreationDate'].values
owner_creation_date = parallelize_dataframe(owner_val, convert_string_to_date)

CPU times: user 44 ms, sys: 68 ms, total: 112 ms
Wall time: 5.59 s


In [9]:
data_input.drop('PostCreationDate',axis=1,inplace=True)

In [10]:
data_input.drop('OwnerCreationDate',axis=1,inplace=True)

In [11]:
owner_undeleted_questions=data_input['OwnerUndeletedAnswerCountAtPostTime'].values

In [12]:
reputation_at_creation=data_input['ReputationAtPostCreation'].values

In [13]:
data_input.drop('OwnerUndeletedAnswerCountAtPostTime',axis=1,inplace=True)
data_input.drop('ReputationAtPostCreation',axis=1,inplace=True)

In [14]:
data_input.shape

(178352, 11)

In [15]:
def bod_len(value):
    if type(value) is str:
        return len(value)
    else:
        return 0

In [16]:
%%time
body_len=data_input["BodyMarkdown"].apply(bod_len)

CPU times: user 120 ms, sys: 8 ms, total: 128 ms
Wall time: 126 ms


In [17]:
def is_code_supplied(body_text):
    if type(body_text) is not str:
        return 0

    for line in body_text.split('\n'):
        if line.startswith('    '):
            return 1
    return 0       

In [18]:
%%time
cod_exists=data_input["BodyMarkdown"].apply(is_code_supplied)

CPU times: user 696 ms, sys: 0 ns, total: 696 ms
Wall time: 693 ms


In [19]:
def number_of_code_blocks(body_text):
    if type(body_text) is not str:
        return 0

    in_code_block = False
    code_blocks = 0
    for line in body_text.split('\n'):
        if line.strip() == '':
            continue
        if in_code_block:
            if line.startswith('    '):
                continue
            else:
                in_code_block = False
        else:
            if line.startswith('    '):
                in_code_block = True
                code_blocks += 1
            else:
                continue
    return code_blocks

In [20]:
%%time
num_code_blocks=data_input["BodyMarkdown"].apply(number_of_code_blocks)

CPU times: user 1.5 s, sys: 0 ns, total: 1.5 s
Wall time: 1.5 s


In [21]:
def no_of_lines(body_text):
    if type(body_text) is not str:
        return 0

    lines_of_code = 0
    for line in body_text.split('\n'):
        if line.startswith('    '):
            lines_of_code += 1
    return lines_of_code
def no_of_words(body_text):
    if type(body_text) is not str:
        return 0
        
    words = 0        
    for line in body_text.split('\n'):
        if line.startswith('    '):
            continue
        words += len(line.split(' '))
    return words

def code_ratio(body_text):
    if type(body_text) is not str:
        return 0
    
    lines_of_code = no_of_lines(body_text)
    words = no_of_words(body_text)
    return lines_of_code / (lines_of_code + (words / 7.0))

In [22]:
%%time
code_body=data_input["BodyMarkdown"].apply(code_ratio)

CPU times: user 3.96 s, sys: 12 ms, total: 3.97 s
Wall time: 3.97 s


In [23]:
def homework_in_title(row):
    if 'homework' in row:
        return 1
    else :
        return 0

In [24]:
%%time
homework_title=data_input['Title'].apply(homework_in_title)

CPU times: user 80 ms, sys: 0 ns, total: 80 ms
Wall time: 77 ms


In [25]:
def homework_in_body(bodytext):
    if 'homework' in bodytext:
        return 1
    else :
        return 0

In [26]:
%%time
homework_body=data_input['BodyMarkdown'].apply(homework_in_body)

CPU times: user 160 ms, sys: 0 ns, total: 160 ms
Wall time: 162 ms


In [27]:
data_input.drop('BodyMarkdown',axis=1,inplace=True)

In [28]:
data_input.drop('Title',axis=1,inplace=True)

In [29]:
data_input.head()

Unnamed: 0,PostId,OwnerUserId,Tag1,Tag2,Tag3,Tag4,Tag5,PostClosedDate,OpenStatus
0,10035653,1159226,c++,,,,,04/05/2012 23:31:34,too localized
1,8922537,1157921,php,xml,cakephp,zip,,01/19/2012 16:43:00,not a real question
2,5962216,696219,iphone-sdk-4.0,,,,,,open
3,10070625,490895,linux,module,kernel,,,,open
4,8960935,1017103,mysql,limit,,,,,open


In [30]:
status_bit = {
    "open": 3,
    "not a real question": 0,
    "off topic": 2,
    "not constructive": 1,
    "too localized": 4
}

In [31]:
def set_label(row):
    return status_bit[row]

In [32]:
%%time
set_label=data_input['OpenStatus'].apply(set_label)

CPU times: user 76 ms, sys: 0 ns, total: 76 ms
Wall time: 74 ms


In [33]:
data_input.drop('OpenStatus',axis=1,inplace=True)

In [34]:
def get_num_tags(row):
    res_ar=[]
    for x in row :
        temp_count=0
        for y in x:
            if isinstance(y,type('str')):
                temp_count=temp_count+1
        res_ar.append(temp_count)
    return res_ar

In [35]:
data_num_tags_temp=data_input[['Tag1','Tag2','Tag3','Tag4','Tag5']].values

In [36]:
%%time
data_num_tags=parallelize_dataframe(data_num_tags_temp,get_num_tags)

CPU times: user 100 ms, sys: 72 ms, total: 172 ms
Wall time: 281 ms


In [37]:
def get_hw_tags_fun(row):
    res_ar=[]
    for x in row :
        temp_count=0
        flag=0
        for y in x:
            if isinstance(y,type('str')):
                if y=='homework':
                    flag=1
        res_ar.append(flag)
    return res_ar

In [38]:
%%time
data_hw_tags=parallelize_dataframe(data_num_tags_temp,get_hw_tags_fun)

CPU times: user 108 ms, sys: 64 ms, total: 172 ms
Wall time: 279 ms


In [39]:
data_input=[]

In [40]:
%%time
day=[]
month=[]
year=[]
for date_temp in post_creation_date:
    date_1=pd.to_datetime(date_temp)
    day.append(date_1.weekday())
    month.append(date_1.month)
    year.append(date_1.year)

CPU times: user 6.8 s, sys: 36 ms, total: 6.84 s
Wall time: 6.83 s


In [41]:
%%time
age=[]
for f, b in zip(post_creation_date, owner_creation_date):
    age.append((f-b)/np.timedelta64(1, 's'))

CPU times: user 792 ms, sys: 0 ns, total: 792 ms
Wall time: 793 ms


In [42]:
final=pd.DataFrame()

In [43]:
final['age']=age

In [44]:
final['day']=day

In [45]:
final['month']=month
final['year']=year
final['ownundel']=owner_undeleted_questions
final['ownrep']=reputation_at_creation
final['bodylen']=body_len
final['codeexists']=cod_exists
final['num_code']=num_code_blocks
final['coderatio']=code_body
final['hwtitle']=homework_title
final['hwbody']=homework_body
final['label']=set_label
final['numtags']=data_num_tags
final['hwtags']=data_hw_tags

In [46]:
final.head()

Unnamed: 0,age,day,month,year,ownundel,ownrep,bodylen,codeexists,num_code,coderatio,hwtitle,hwbody,label,numtags,hwtags
0,6659479.0,3,4,2012,0,1,1276,1,1,0.462264,0,0,4,1,0
1,413.0,3,1,2012,0,1,352,0,0,0.0,0,0,0,4,0
2,2948660.0,2,5,2011,2,40,500,0,0,0.0,0,0,3,1,0
3,45631059.0,0,4,2012,1,1,371,0,0,0.0,0,0,3,3,0
4,7494233.0,6,1,2012,0,28,169,1,1,0.25,0,0,3,2,0


In [47]:
final.to_csv('large_small.csv')