In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 4.5 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.0


In [3]:
import os
import pandas as pd
import numpy as np
import re
import string

from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

In [4]:
DIR = '/content/drive/MyDrive/Competitions/Signate/MUFJ'
INPUT_DIR = os.path.join(DIR,'input')
OUTPUT_DIR = os.path.join(DIR,'output')

In [5]:
train_df = pd.read_csv(os.path.join(INPUT_DIR,'train.csv'))
test_df = pd.read_csv(os.path.join(INPUT_DIR,'test.csv'))

In [6]:
display(train_df.head(3))
print(train_df.shape)
display(test_df.head(3))
print(test_df.shape)

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state
0,train_00000,20001-21000,US,45,art,mixed media,"<div class=""contents""><div><p><a href=""http://...",1
1,train_00001,19001-20000,US,59,food,restaurants,"<div class=""contents""><div><p>Cultural Pretzel...",0
2,train_00002,2001-3000,US,38,art,performance art,"<div class=""contents""><div><p>I want to perfor...",0


(9791, 8)


Unnamed: 0,id,goal,country,duration,category1,category2,html_content
0,test_00000,5001-6000,FR,30,dance,performances,"<div class=""contents""><div><p>Bonjour ,</p><p>..."
1,test_00001,6001-7000,GB,23,publishing,children's books,"<div class=""contents""><div><p><span class=""bol..."
2,test_00002,6001-7000,GB,30,theater,plays,"<div class=""contents""><div><p>COW is a rural t..."


(9800, 7)


In [7]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub('',text)


def remove_html(text):
    html=re.compile(r"<[^>]*?>")
    return html.sub('',text)

def cleaning(texts):
    clean_texts = []
    for text in texts:
        # htmlタグを削除
        text = remove_URL(text)
        text = remove_html(text)
        #アルファベット以外をスペースに置き換え
        #clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
        #改行削除
        #text = text.replace("\n","")
        clean_texts.append(text)
    return clean_texts

In [8]:
def get_goal_values(df):
  df["goal"].replace("100000+","100000-100000",inplace=True)
  _df = df["goal"].str.split('-').apply(pd.Series).astype(float)
  _df.columns = ["goal_max","goal_min"]
  df["goal"] = _df.median(axis=1)
  df["goal"] = df["goal"].astype(int)
  return df

In [9]:
train_len = train_df.shape[0]

concat_df = pd.concat([train_df,test_df],axis=0)
concat_df = concat_df.reset_index(drop=True)

concat_df['content'] = cleaning(concat_df['html_content'])

concat_df = get_goal_values(concat_df)

In [10]:
concat_df

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,content
0,train_00000,20500,US,45,art,mixed media,"<div class=""contents""><div><p><a href=""http://...",1.0,"The Shillito's Elves attracted close to 100,00..."
1,train_00001,19500,US,59,food,restaurants,"<div class=""contents""><div><p>Cultural Pretzel...",0.0,Cultural Pretzel Sports Bar is a place where p...
2,train_00002,2500,US,38,art,performance art,"<div class=""contents""><div><p>I want to perfor...",0.0,"I want to perform this piece guerilla style, o..."
3,train_00003,1500,US,30,art,mixed media,"<div class=""contents""><div><div class=""templat...",1.0,"\n\n\n\n\n\nCanyon de Chelley, Dine' (Navajo) ..."
4,train_00004,1500,US,29,film & video,webseries,"<div class=""contents""><div><p>The story of the...",1.0,"The story of the show, both on and off screen,..."
...,...,...,...,...,...,...,...,...,...
19586,test_09795,4500,US,29,music,world music,"<div class=""contents""><div><h1 class=""page-anc...",,How Tibetana Started\nIt all began after the t...
19587,test_09796,10500,US,30,publishing,children's books,"<div class=""contents""><div><p><span class=""bol...",,The Wild Waves Whist is a board book that take...
19588,test_09797,2500,US,30,music,hip-hop,"<div class=""contents""><div><h1 class=""page-anc...",,
19589,test_09798,7500,US,30,theater,plays,"<div class=""contents""><div><p>Have you ever re...",,Have you ever read a book or seen a movie and ...


In [11]:
def get_num_feature(input_df):
  output_df = pd.DataFrame()
  output_df["ratio_goal_duration"] = input_df["goal"] / input_df["duration"]
  output_df["prod_goal_duration"] = input_df["goal"] * input_df["duration"]
  return output_df

In [12]:
def get_catg_feature(input_df):
  output_df = pd.DataFrame()
  output_df["category3"] = input_df["category1"] + input_df["category2"]
  output_df["country_category1"] = input_df["country"] + input_df["category1"]
  output_df["country_category2"] = input_df["country"] + input_df["category2"]
  output_df["country_category3"] = input_df["country"] + output_df["category3"]
  return output_df

In [13]:
def get_ce_feature(input_df):
  _input_df = pd.concat([input_df,
                         get_catg_feature(input_df)]
                        ,axis=1)
  output_df = pd.DataFrame()
  cols = [
      "country",
      "category1",
      "category2",
      "category3",
      "country_category1",
      "country_category2",
      "country_category3"
  ]
  encoder = ce.CountEncoder()
  output_df = encoder.fit_transform(_input_df[cols]).add_prefix("CE_")
  return output_df

In [14]:
def get_le_feature(input_df):
  _input_df = pd.concat([input_df,
                         get_catg_feature(input_df)]
                        ,axis=1)
  output_df = pd.DataFrame()
  cols = [
      "country",
      "category1",
      "category2"
  ]
  for col in cols:
    encoder =  LabelEncoder()
    output_df[col+"_LE"] = encoder.fit_transform(_input_df[col])
  return output_df

In [15]:
def get_te_features(input_df):
  _input_df =  pd.concat([
        input_df,
        get_catg_feature(input_df)
    ], axis=1)
  output_df = pd.DataFrame()
  cols = [
      "country",
      "category1",
      "category2",
      "category3",
      "country_category1",
      "country_category2",
      "country_category3"
  ]
  for col in cols:
    target_dict = _input_df[[col,"state"]].groupby(col)["state"].mean().to_dict()
    encoded = _input_df[col].map(lambda x: target_dict[x]).values
    output_df[f"{col}_success_rate"] = encoded
  return output_df

In [16]:
def text_feature(input_df):
    def total_sentence(x):
        x = x.replace("!", "[end]").replace("?", "[end]").replace(".", "[end]")
        return len(x.split("[end]"))

    input_df['content'] = input_df['content'].astype(str).fillna('missing')
    _df = pd.DataFrame()
    _df['num_chars'] = input_df['content'].apply(len)
    _df['num_words'] = input_df['content'].apply(lambda x: len(x.split())+1)
    _df['num_sentence'] = input_df['content'].apply(lambda x: total_sentence(x))
    _df['num_unique_words'] = input_df['content'].apply(lambda x: len(set(w for w in x.split())))
    _df['words_vs_unique'] = _df['num_unique_words'] / _df['num_words']
    _df['words_vs_chars'] = _df['num_words'] / _df['num_chars']
    _df['chars_vs_sentence'] = _df['num_sentence'] / _df['num_chars']
    _df['words_vs_sentence'] = _df['num_sentence'] / _df['num_words']
    return _df

In [17]:
def get_html_info(input_df):
    input_df = pd.concat([input_df,text_feature(input_df)],axis=1)
    output_df = pd.DataFrame()
    output_df['num_figure'] = input_df['html_content'].apply(lambda x: x.count('<figure>'))
    output_df['num_video'] = input_df['html_content'].apply(lambda x: x.count('<video>'))
    output_df['num_figure+video'] = output_df['num_figure'] + output_df['num_video']
    output_df['num_figure+video_vs_words'] = output_df['num_figure+video'] / input_df['num_words']
    output_df['count_sent'] = input_df["html_content"].apply(lambda x: len(re.findall("\n",str(x)))+1)

    return output_df

In [18]:
def get_remove_html_info(input_df):
  input_df = pd.concat([input_df,text_feature(input_df)],axis=1)
  output_df = pd.DataFrame()
  output_df['num_exclamation_marks'] = input_df['content'].apply(lambda x: x.count('!'))
  output_df['num_question_marks'] = input_df['content'].apply(lambda x: x.count('?'))
  output_df['num_punctuation'] = input_df['content'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
  output_df["num_words_upper"] = input_df["content"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
  output_df["num_words_title"] = input_df["content"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
  #output_df["num_stopwords"] = input_df["content"].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))
  output_df['punct_percent'] = output_df['num_punctuation'] / input_df['num_words']
  output_df["mean_word_len"] = input_df["content"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
  return output_df

In [19]:
feature_df = pd.concat([
    concat_df,
    get_catg_feature(concat_df),
    get_num_feature(concat_df),
    get_ce_feature(concat_df),
    get_le_feature(concat_df),
    get_te_features(concat_df),
    text_feature(concat_df),
    get_html_info(concat_df),
    get_remove_html_info(concat_df)
    ],axis=1)

  out=out, **kwargs)


In [20]:
feature_df

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,content,category3,...,num_figure+video,num_figure+video_vs_words,count_sent,num_exclamation_marks,num_question_marks,num_punctuation,num_words_upper,num_words_title,punct_percent,mean_word_len
0,train_00000,20500,US,45,art,mixed media,"<div class=""contents""><div><p><a href=""http://...",1.0,"The Shillito's Elves attracted close to 100,00...",artmixed media,...,2,0.011696,32,0,0,20,0,20,0.116959,4.552941
1,train_00001,19500,US,59,food,restaurants,"<div class=""contents""><div><p>Cultural Pretzel...",0.0,Cultural Pretzel Sports Bar is a place where p...,foodrestaurants,...,2,0.012048,11,1,0,28,9,57,0.168675,5.327273
2,train_00002,2500,US,38,art,performance art,"<div class=""contents""><div><p>I want to perfor...",0.0,"I want to perform this piece guerilla style, o...",artperformance art,...,0,0.000000,12,0,0,56,20,36,0.150943,4.694595
3,train_00003,1500,US,30,art,mixed media,"<div class=""contents""><div><div class=""templat...",1.0,"\n\n\n\n\n\nCanyon de Chelley, Dine' (Navajo) ...",artmixed media,...,11,0.041199,62,0,0,90,10,33,0.337079,5.180451
4,train_00004,1500,US,29,film & video,webseries,"<div class=""contents""><div><p>The story of the...",1.0,"The story of the show, both on and off screen,...",film & videowebseries,...,11,0.044355,66,5,0,59,1,46,0.237903,5.129555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19586,test_09795,4500,US,29,music,world music,"<div class=""contents""><div><h1 class=""page-anc...",,How Tibetana Started\nIt all began after the t...,musicworld music,...,5,0.011905,248,0,0,52,4,79,0.123810,4.727924
19587,test_09796,10500,US,30,publishing,children's books,"<div class=""contents""><div><p><span class=""bol...",,The Wild Waves Whist is a board book that take...,publishingchildren's books,...,4,0.003373,397,2,6,180,3,170,0.151771,5.013502
19588,test_09797,2500,US,30,music,hip-hop,"<div class=""contents""><div><h1 class=""page-anc...",,,musichip-hop,...,0,0.000000,1,0,0,0,0,0,0.000000,
19589,test_09798,7500,US,30,theater,plays,"<div class=""contents""><div><p>Have you ever re...",,Have you ever read a book or seen a movie and ...,theaterplays,...,0,0.000000,12,1,2,40,2,48,0.096154,4.640964


In [21]:
def get_agg_func(input_df,col):
  _input_df =  pd.concat([
        input_df,
        get_catg_feature(input_df),
        get_num_feature(input_df)
    ], axis=1)
  output_df = _input_df.groupby(col).agg(
            goal_min=pd.NamedAgg(column="goal", aggfunc="min"),
            goal_max=pd.NamedAgg(column="goal", aggfunc="max"),
            goal_mean=pd.NamedAgg(column="goal", aggfunc="mean"),
            goal_std=pd.NamedAgg(column="goal", aggfunc="std"),
            duration_min=pd.NamedAgg(column="duration", aggfunc="min"),
            duration_max=pd.NamedAgg(column="duration", aggfunc="max"),
            duration_mean=pd.NamedAgg(column="duration", aggfunc="mean"),
            duration_std=pd.NamedAgg(column="duration", aggfunc="std"),
            ratio_goal_durataion_min=pd.NamedAgg(column="ratio_goal_duration", aggfunc="min"),
            ratio_goal_durataion_max=pd.NamedAgg(column="ratio_goal_duration", aggfunc="max"),
            ratio_goal_durataion_mean=pd.NamedAgg(column="ratio_goal_duration", aggfunc="mean"),
            ratio_goal_durataion_std=pd.NamedAgg(column="ratio_goal_duration", aggfunc="std"),
            prod_goal_durataion_min=pd.NamedAgg(column="prod_goal_duration", aggfunc="min"),
            prod_goal_durataion_max=pd.NamedAgg(column="prod_goal_duration", aggfunc="max"),
            prod_goal_durataion_mean=pd.NamedAgg(column="prod_goal_duration", aggfunc="mean"),
            prod_goal_durataion_std=pd.NamedAgg(column="prod_goal_duration", aggfunc="std")
            ).add_prefix(f'{col}_')
  return output_df

country_agg =  get_agg_func(concat_df,"country")
category1_agg =  get_agg_func(concat_df,"category1")
category2_agg =  get_agg_func(concat_df,"category2")
category3_agg =  get_agg_func(concat_df,"category3")
country_category_agg = get_agg_func(concat_df,"country_category1")

In [22]:
feature_df = feature_df.merge(country_agg,how="left",on="country").merge(category1_agg,how="left",on="category1").merge(category2_agg,how="left",on="category2").merge(category3_agg,how="left",on="category3").merge(country_category_agg,how="left",on="country_category1")
feature_df

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,content,category3,...,country_category1_duration_mean,country_category1_duration_std,country_category1_ratio_goal_durataion_min,country_category1_ratio_goal_durataion_max,country_category1_ratio_goal_durataion_mean,country_category1_ratio_goal_durataion_std,country_category1_prod_goal_durataion_min,country_category1_prod_goal_durataion_max,country_category1_prod_goal_durataion_mean,country_category1_prod_goal_durataion_std
0,train_00000,20500,US,45,art,mixed media,"<div class=""contents""><div><p><a href=""http://...",1.0,"The Shillito's Elves attracted close to 100,00...",artmixed media,...,31.600760,12.620717,6.250000,12500.000000,244.482430,578.729051,500,6000000,249136.882129,5.500174e+05
1,train_00001,19500,US,59,food,restaurants,"<div class=""contents""><div><p>Cultural Pretzel...",0.0,Cultural Pretzel Sports Bar is a place where p...,foodrestaurants,...,33.732731,11.964994,8.333333,5882.352941,618.340636,764.186355,1500,6000000,736182.698515,1.047290e+06
2,train_00002,2500,US,38,art,performance art,"<div class=""contents""><div><p>I want to perfor...",0.0,"I want to perform this piece guerilla style, o...",artperformance art,...,31.600760,12.620717,6.250000,12500.000000,244.482430,578.729051,500,6000000,249136.882129,5.500174e+05
3,train_00003,1500,US,30,art,mixed media,"<div class=""contents""><div><div class=""templat...",1.0,"\n\n\n\n\n\nCanyon de Chelley, Dine' (Navajo) ...",artmixed media,...,31.600760,12.620717,6.250000,12500.000000,244.482430,578.729051,500,6000000,249136.882129,5.500174e+05
4,train_00004,1500,US,29,film & video,webseries,"<div class=""contents""><div><p>The story of the...",1.0,"The story of the show, both on and off screen,...",film & videowebseries,...,32.663082,11.797912,8.333333,20000.000000,515.681657,899.536571,1000,6000000,594444.188428,1.054557e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19586,test_09795,4500,US,29,music,world music,"<div class=""contents""><div><h1 class=""page-anc...",,How Tibetana Started\nIt all began after the t...,musicworld music,...,33.058205,12.137533,7.936508,11111.111111,240.053860,531.906690,1500,6000000,270520.412288,6.344851e+05
19587,test_09796,10500,US,30,publishing,children's books,"<div class=""contents""><div><p><span class=""bol...",,The Wild Waves Whist is a board book that take...,publishingchildren's books,...,31.835979,10.688245,8.333333,6666.666667,232.389737,446.146770,3000,6000000,234223.356009,5.066511e+05
19588,test_09797,2500,US,30,music,hip-hop,"<div class=""contents""><div><h1 class=""page-anc...",,,musichip-hop,...,33.058205,12.137533,7.936508,11111.111111,240.053860,531.906690,1500,6000000,270520.412288,6.344851e+05
19589,test_09798,7500,US,30,theater,plays,"<div class=""contents""><div><p>Have you ever re...",,Have you ever read a book or seen a movie and ...,theaterplays,...,30.787500,11.947561,8.333333,5642.857143,327.681122,599.112506,2000,6000000,375081.250000,8.739281e+05


In [23]:
train_feature = feature_df.iloc[:train_len,:]
test_feature = feature_df.iloc[train_len:,:]
test_feature = test_feature.reset_index(drop=True)

train_feature.to_csv(os.path.join(OUTPUT_DIR,"feature_train.csv"),index=False)
test_feature.to_csv(os.path.join(OUTPUT_DIR,"feature_test.csv"),index=False)

display(train_feature.head(3))
display(test_feature.head(3))

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,content,category3,...,country_category1_duration_mean,country_category1_duration_std,country_category1_ratio_goal_durataion_min,country_category1_ratio_goal_durataion_max,country_category1_ratio_goal_durataion_mean,country_category1_ratio_goal_durataion_std,country_category1_prod_goal_durataion_min,country_category1_prod_goal_durataion_max,country_category1_prod_goal_durataion_mean,country_category1_prod_goal_durataion_std
0,train_00000,20500,US,45,art,mixed media,"<div class=""contents""><div><p><a href=""http://...",1.0,"The Shillito's Elves attracted close to 100,00...",artmixed media,...,31.60076,12.620717,6.25,12500.0,244.48243,578.729051,500,6000000,249136.882129,550017.4
1,train_00001,19500,US,59,food,restaurants,"<div class=""contents""><div><p>Cultural Pretzel...",0.0,Cultural Pretzel Sports Bar is a place where p...,foodrestaurants,...,33.732731,11.964994,8.333333,5882.352941,618.340636,764.186355,1500,6000000,736182.698515,1047290.0
2,train_00002,2500,US,38,art,performance art,"<div class=""contents""><div><p>I want to perfor...",0.0,"I want to perform this piece guerilla style, o...",artperformance art,...,31.60076,12.620717,6.25,12500.0,244.48243,578.729051,500,6000000,249136.882129,550017.4


Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,content,category3,...,country_category1_duration_mean,country_category1_duration_std,country_category1_ratio_goal_durataion_min,country_category1_ratio_goal_durataion_max,country_category1_ratio_goal_durataion_mean,country_category1_ratio_goal_durataion_std,country_category1_prod_goal_durataion_min,country_category1_prod_goal_durataion_max,country_category1_prod_goal_durataion_mean,country_category1_prod_goal_durataion_std
0,test_00000,5500,FR,30,dance,performances,"<div class=""contents""><div><p>Bonjour ,</p><p>...",,"Bonjour ,Je m'appelle Morgane Hilgers. Je suis...",danceperformances,...,30.0,0.0,50.0,516.666667,233.333333,199.071921,45000,465000,210000.0,179164.728672
1,test_00001,6500,GB,23,publishing,children's books,"<div class=""contents""><div><p><span class=""bol...",,The projectThe hidden world of microorganisms ...,publishingchildren's books,...,31.967213,9.880551,8.333333,3333.333333,267.040476,543.368913,5000,3000000,249453.551913,472767.270506
2,test_00002,6500,GB,30,theater,plays,"<div class=""contents""><div><p>COW is a rural t...",,"COW is a rural tragicomedy with songs, written...",theaterplays,...,30.329897,12.010549,8.333333,1950.0,140.056098,222.406743,3000,1755000,131907.216495,230673.982419
