# Combined Text Data EDA Prep

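This notebook:
* Exports the combined text-feature train and test tables from the Postgres RDS database to CSV files.
* Loads the feature columns with smaller datatypes (`int16` / `float32`) to reduce memory usage.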

### Imports and Global Settings

In [1]:
import os
import tempfile
import numpy as np
import pandas as pd
# Connecting to Postgres RDS on AWS
from sqlalchemy import create_engine
from sqlalchemy.dialects import postgresql

# Display settings: render floats with five decimal places and let pandas print
# up to 200 columns / 200 rows before truncating.
pd.set_option("display.float_format", "{:.5f}".format)
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

### Import and Convert Data

In [2]:
# Connection settings for the Postgres database.
# The endpoint (host[:port]) and password are read from the environment so that
# the notebook can be re-run without editing this cell and without committing
# secrets. Both fall back to None when the variable is not set, in which case
# they must be assigned here manually before creating the engine.
db_endpoint = os.environ.get("YELP_DB_ENDPOINT")
db_name = "yelp_2021_db"
db_password = os.environ.get("YELP_DB_PASSWORD")

In [3]:
# Create the SQLAlchemy engine for the database using the psycopg2 driver,
# authenticating as user `postgres` with the settings defined above.
# NOTE(review): the password is interpolated into the URL as-is; characters such
# as '@' or '/' would presumably need URL-escaping first — confirm.
engine = create_engine(f"postgresql+psycopg2://postgres:{db_password}@{db_endpoint}/{db_name}")

In [4]:
# Full-table SELECTs for the two splits; only the table-name suffix differs.
train_query, test_query = (
    f"SELECT * FROM text_combined_{split}" for split in ("train", "test")
)

In [5]:
datatypes = {'target_reg': 'int16',
 'review_stars': 'int16',
 'NB_prob': 'float32',
 'svm_pred': 'float32',
 'grade_level': 'float32',
 'polarity': 'float32',
 'subjectivity': 'float32',
 'word_cnt': 'int16',
 'character_cnt': 'int16',
 'num_cnt': 'int16',
 'uppercase_cnt': 'int16',
 '#@_cnt': 'int16',
 'sentence_cnt': 'int16',
 'lexicon_cnt': 'int16',
 'syllable_cnt': 'int16',
 'avg_word_len': 'float32',
 'token_cnt': 'int16',
 'stopword_cnt': 'int16',
 'stopword_pct': 'float32',
 'ent_cnt': 'int16',
 'ent_pct': 'float32',
 'pos_adj_pct': 'float32',
 'pos_adj_cnt': 'int16',
 'pos_adp_pct': 'float32',
 'pos_adp_cnt': 'int16',
 'pos_adv_pct': 'float32',
 'pos_adv_cnt': 'int16',
 'pos_aux_pct': 'float32',
 'pos_aux_cnt': 'int16',
 'pos_conj_pct': 'float32',
 'pos_conj_cnt': 'int16',
 'pos_det_pct': 'float32',
 'pos_det_cnt': 'int16',
 'pos_intj_pct': 'float32',
 'pos_intj_cnt': 'int16',
 'pos_noun_pct': 'float32',
 'pos_noun_cnt': 'int16',
 'pos_num_pct': 'float32',
 'pos_num_cnt': 'int16',
 'pos_part_pct': 'float32',
 'pos_part_cnt': 'int16',
 'pos_pron_pct': 'float32',
 'pos_pron_cnt': 'int16',
 'pos_propn_pct': 'float32',
 'pos_propn_cnt': 'int16',
 'pos_punct_pct': 'float32',
 'pos_punct_cnt': 'int16',
 'pos_sconj_pct': 'float32',
 'pos_sconj_cnt': 'int16',
 'pos_sym_pct': 'float32',
 'pos_sym_cnt': 'int16',
 'pos_verb_pct': 'float32',
 'pos_verb_cnt': 'int16',
 'pos_x_pct': 'float32',
 'pos_x_cnt': 'int16',
 'dep_root_pct': 'float32',
 'dep_root_cnt': 'int16',
 'dep_acl_pct': 'float32',
 'dep_acl_cnt': 'int16',
 'dep_acomp_pct': 'float32',
 'dep_acomp_cnt': 'int16',
 'dep_advcl_pct': 'float32',
 'dep_advcl_cnt': 'int16',
 'dep_advmod_pct': 'float32',
 'dep_advmod_cnt': 'int16',
 'dep_agent_pct': 'float32',
 'dep_agent_cnt': 'int16',
 'dep_amod_pct': 'float32',
 'dep_amod_cnt': 'int16',
 'dep_appos_pct': 'float32',
 'dep_appos_cnt': 'int16',
 'dep_attr_pct': 'float32',
 'dep_attr_cnt': 'int16',
 'dep_aux_pct': 'float32',
 'dep_aux_cnt': 'int16',
 'dep_auxpass_pct': 'float32',
 'dep_auxpass_cnt': 'int16',
 'dep_case_pct': 'float32',
 'dep_case_cnt': 'int16',
 'dep_cc_pct': 'float32',
 'dep_cc_cnt': 'int16',
 'dep_ccomp_pct': 'float32',
 'dep_ccomp_cnt': 'int16',
 'dep_compound_pct': 'float32',
 'dep_compound_cnt': 'int16',
 'dep_conj_pct': 'float32',
 'dep_conj_cnt': 'int16',
 'dep_csubj_pct': 'float32',
 'dep_csubj_cnt': 'int16',
 'dep_csubjpass_pct': 'float32',
 'dep_csubjpass_cnt': 'int16',
 'dep_dative_pct': 'float32',
 'dep_dative_cnt': 'int16',
 'dep_dep_pct': 'float32',
 'dep_dep_cnt': 'int16',
 'dep_det_pct': 'float32',
 'dep_det_cnt': 'int16',
 'dep_dobj_pct': 'float32',
 'dep_dobj_cnt': 'int16',
 'dep_expl_pct': 'float32',
 'dep_expl_cnt': 'int16',
 'dep_intj_pct': 'float32',
 'dep_intj_cnt': 'int16',
 'dep_mark_pct': 'float32',
 'dep_mark_cnt': 'int16',
 'dep_meta_pct': 'float32',
 'dep_meta_cnt': 'int16',
 'dep_neg_pct': 'float32',
 'dep_neg_cnt': 'int16',
 'dep_nmod_pct': 'float32',
 'dep_nmod_cnt': 'int16',
 'dep_npadvmod_pct': 'float32',
 'dep_npadvmod_cnt': 'int16',
 'dep_nsubj_pct': 'float32',
 'dep_nsubj_cnt': 'int16',
 'dep_nsubjpass_pct': 'float32',
 'dep_nsubjpass_cnt': 'int16',
 'dep_nummod_pct': 'float32',
 'dep_nummod_cnt': 'int16',
 'dep_oprd_pct': 'float32',
 'dep_oprd_cnt': 'int16',
 'dep_parataxis_pct': 'float32',
 'dep_parataxis_cnt': 'int16',
 'dep_pcomp_pct': 'float32',
 'dep_pcomp_cnt': 'int16',
 'dep_pobj_pct': 'float32',
 'dep_pobj_cnt': 'int16',
 'dep_poss_pct': 'float32',
 'dep_poss_cnt': 'int16',
 'dep_preconj_pct': 'float32',
 'dep_preconj_cnt': 'int16',
 'dep_predet_pct': 'float32',
 'dep_predet_cnt': 'int16',
 'dep_prep_pct': 'float32',
 'dep_prep_cnt': 'int16',
 'dep_prt_pct': 'float32',
 'dep_prt_cnt': 'int16',
 'dep_punct_pct': 'float32',
 'dep_punct_cnt': 'int16',
 'dep_quantmod_pct': 'float32',
 'dep_quantmod_cnt': 'int16',
 'dep_relcl_pct': 'float32',
 'dep_relcl_cnt': 'int16',
 'dep_xcomp_pct': 'float32',
 'dep_xcomp_cnt': 'int16',
 'ent_cardinal_pct': 'float32',
 'ent_cardinal_cnt': 'int16',
 'ent_date_pct': 'float32',
 'ent_date_cnt': 'int16',
 'ent_event_pct': 'float32',
 'ent_event_cnt': 'int16',
 'ent_fac_pct': 'float32',
 'ent_fac_cnt': 'int16',
 'ent_gpe_pct': 'float32',
 'ent_gpe_cnt': 'int16',
 'ent_language_pct': 'float32',
 'ent_language_cnt': 'int16',
 'ent_law_pct': 'float32',
 'ent_law_cnt': 'int16',
 'ent_loc_pct': 'float32',
 'ent_loc_cnt': 'int16',
 'ent_money_pct': 'float32',
 'ent_money_cnt': 'int16',
 'ent_norp_pct': 'float32',
 'ent_norp_cnt': 'int16',
 'ent_ordinal_pct': 'float32',
 'ent_ordinal_cnt': 'int16',
 'ent_org_pct': 'float32',
 'ent_org_cnt': 'int16',
 'ent_percent_pct': 'float32',
 'ent_percent_cnt': 'int16',
 'ent_person_pct': 'float32',
 'ent_person_cnt': 'int16',
 'ent_product_pct': 'float32',
 'ent_product_cnt': 'int16',
 'ent_quantity_pct': 'float32',
 'ent_quantity_cnt': 'int16',
 'ent_time_pct': 'float32',
 'ent_time_cnt': 'int16',
 'ent_work_of_art_pct': 'float32',
 'ent_work_of_art_cnt': 'int16'}

In [6]:
def read_sql_tmpfile(query, db_engine, datatype_dict):
    """Run ``query`` through Postgres ``COPY ... TO STDOUT`` and load it as a DataFrame.

    The result set is streamed as CSV (with a header row) into a temporary
    file and then parsed with ``pd.read_csv``.

    Parameters
    ----------
    query : str
        SELECT statement to export; it is embedded verbatim inside
        ``COPY (...)``, so it must come from a trusted source.
    db_engine : sqlalchemy.engine.Engine
        Engine whose raw DBAPI connection provides a cursor with
        ``copy_expert`` (psycopg2).
    datatype_dict : dict
        Column name -> dtype mapping forwarded to ``pd.read_csv(dtype=...)``.

    Returns
    -------
    pandas.DataFrame
        The query result, with the requested dtypes applied.
    """
    copy_sql = f"COPY ({query}) TO STDOUT WITH CSV HEADER"
    with tempfile.TemporaryFile() as tmpfile:
        conn = db_engine.raw_connection()
        try:
            cur = conn.cursor()
            try:
                cur.copy_expert(copy_sql, tmpfile)
            finally:
                # Release the cursor even if the COPY fails.
                cur.close()
        finally:
            # Always close the raw connection so it is not leaked on every
            # call (or on error).
            conn.close()
        # Rewind so read_csv starts at the header row.
        tmpfile.seek(0)
        return pd.read_csv(tmpfile, dtype=datatype_dict)

In [7]:
# Load the training table with the compact dtypes, then map the 't' / 'f'
# strings in `target_clf` to Python booleans.
train = read_sql_tmpfile(train_query, engine, datatypes).replace(
    {'target_clf': {'t': True, 'f': False}}
)

In [8]:
# Print the full per-column summary of the training frame to check the dtypes.
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5523992 entries, 0 to 5523991
Data columns (total 183 columns):
 #    Column               Dtype  
---   ------               -----  
 0    review_id            object 
 1    target_clf           bool   
 2    target_reg           int16  
 3    review_stars         int16  
 4    NB_prob              float32
 5    svm_pred             float32
 6    grade_level          float32
 7    polarity             float32
 8    subjectivity         float32
 9    word_cnt             int16  
 10   character_cnt        int16  
 11   num_cnt              int16  
 12   uppercase_cnt        int16  
 13   #@_cnt               int16  
 14   sentence_cnt         int16  
 15   lexicon_cnt          int16  
 16   syllable_cnt         int16  
 17   avg_word_len         float32
 18   token_cnt            int16  
 19   stopword_cnt         int16  
 20   stopword_pct         float32
 21   ent_cnt              int16  
 22   ent_pct              float32
 23   pos_a

In [9]:
# Per-column memory footprint in bytes; deep=True also measures the contents of
# object columns.
train.memory_usage(deep=True)

Index                        128
review_id              436395368
target_clf               5523992
target_reg              11047984
review_stars            11047984
NB_prob                 22095968
svm_pred                22095968
grade_level             22095968
polarity                22095968
subjectivity            22095968
word_cnt                11047984
character_cnt           11047984
num_cnt                 11047984
uppercase_cnt           11047984
#@_cnt                  11047984
sentence_cnt            11047984
lexicon_cnt             11047984
syllable_cnt            11047984
avg_word_len            22095968
token_cnt               11047984
stopword_cnt            11047984
stopword_pct            22095968
ent_cnt                 11047984
ent_pct                 22095968
pos_adj_pct             22095968
pos_adj_cnt             11047984
pos_adp_pct             22095968
pos_adp_cnt             11047984
pos_adv_pct             22095968
pos_adv_cnt             11047984
pos_aux_pc

In [10]:
# Load the test table with the compact dtypes, then map the 't' / 'f'
# strings in `target_clf` to Python booleans.
test = read_sql_tmpfile(test_query, engine, datatypes).replace(
    {'target_clf': {'t': True, 'f': False}}
)

In [11]:
# Print the full per-column summary of the test frame to check the dtypes.
test.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1382379 entries, 0 to 1382378
Data columns (total 183 columns):
 #    Column               Dtype  
---   ------               -----  
 0    review_id            object 
 1    target_clf           bool   
 2    target_reg           int16  
 3    review_stars         int16  
 4    NB_prob              float32
 5    svm_pred             float32
 6    grade_level          float32
 7    polarity             float32
 8    subjectivity         float32
 9    word_cnt             int16  
 10   character_cnt        int16  
 11   num_cnt              int16  
 12   uppercase_cnt        int16  
 13   #@_cnt               int16  
 14   sentence_cnt         int16  
 15   lexicon_cnt          int16  
 16   syllable_cnt         int16  
 17   avg_word_len         float32
 18   token_cnt            int16  
 19   stopword_cnt         int16  
 20   stopword_pct         float32
 21   ent_cnt              int16  
 22   ent_pct              float32
 23   pos_a

In [12]:
# Per-column memory footprint in bytes; deep=True also measures the contents of
# object columns.
test.memory_usage(deep=True)

Index                        128
review_id              109207941
target_clf               1382379
target_reg               2764758
review_stars             2764758
NB_prob                  5529516
svm_pred                 5529516
grade_level              5529516
polarity                 5529516
subjectivity             5529516
word_cnt                 2764758
character_cnt            2764758
num_cnt                  2764758
uppercase_cnt            2764758
#@_cnt                   2764758
sentence_cnt             2764758
lexicon_cnt              2764758
syllable_cnt             2764758
avg_word_len             5529516
token_cnt                2764758
stopword_cnt             2764758
stopword_pct             5529516
ent_cnt                  2764758
ent_pct                  5529516
pos_adj_pct              5529516
pos_adj_cnt              2764758
pos_adp_pct              5529516
pos_adp_cnt              2764758
pos_adv_pct              5529516
pos_adv_cnt              2764758
pos_aux_pc

### To CSV

In [13]:
# Directory that receives the exported CSV files. The trailing slash is required
# because the file names are appended directly to this prefix when writing.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
filepath_prefix = "/home/jeff/Documents/Data_Science_Projects/Yelp_Reviews/data/full_data/model_ready/"

In [14]:
# Write the training frame to <prefix>train.csv, omitting the index column.
train.to_csv(filepath_prefix + "train.csv", index=False)

In [15]:
# Write the test frame to <prefix>test.csv, omitting the index column.
test.to_csv(filepath_prefix + "test.csv", index=False)