# Stage 3.2: Processed Combined Text Data EDA

Exploratory data analysis of the model-ready features after Stage 3 processing.

* **Input**: `data/model_ready/{train,test}.csv` (from Stage 3)
* **Features**: 186 NLP and text features for predicting review quality

This notebook contains:
* EDA of processed text data with Dataprep.eda and plotting libraries

## Imports and Global Settings

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# EDA libraries
from dataprep.eda import plot, plot_correlation

# Add project root to path for imports
sys.path.insert(0, str(Path.cwd().parent))
from src.config import PathConfig

# Render floats in fixed-point notation with exactly five decimal places
pd.set_option('display.float_format', lambda value: '%.5f' % value)
# Allow up to 200 columns and 200 rows before pandas truncates a displayed frame
for display_limit in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_limit, 200)

## Loading Data

In [None]:
# Resolve the model-ready directory (Stage 3 output) through the project's
# PathConfig helper and echo it so the data source is visible in the output
model_ready_dir = PathConfig.get_model_ready_dir()
print(f"Reading from: {model_ready_dir}")

In [None]:
# For large datasets, limit the number of records loaded.
# These values are passed to pd.read_csv(nrows=...); None loads every row.
train_records_to_load = None  # Load all train data (original note: 7000 rows — TODO confirm)
test_records_to_load = None   # Load all test data (original note: 1500 rows — TODO confirm)

In [None]:
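# Dtypes are inferred by pandas when the CSV files are read.
# NOTE(review): CSV stores no dtype information, so any memory-efficient dtypes
# applied by the Stage 3 script are not carried over by the file itself;
# pass `dtype=` to `pd.read_csv` if compact dtypes are needed here.
# No manual dtype mapping is applied in this notebook.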

In [None]:
# Read the train/test splits from the model-ready directory.
# `nrows=None` makes pandas read every row of the file.
train = pd.read_csv(model_ready_dir / "train.csv", nrows=train_records_to_load)
test = pd.read_csv(model_ready_dir / "test.csv", nrows=test_records_to_load)

# Report the shape of each split (row counts use a thousands separator)
print(f"Train: {train.shape[0]:,} rows, {train.shape[1]} columns")
print(f"Test: {test.shape[0]:,} rows, {test.shape[1]} columns")

In [None]:
# Partition the column names into identifiers, targets, and model features.
# Targets are recognised purely by a leading capital "T" (T1_REG, T2_CLS, etc.).
# NOTE(review): the saved `train.columns.values` output further down lists
# 'target_clf' / 'target_reg', which this case-sensitive prefix test would not
# match — confirm the current Stage 3 schema.
id_cols = ['review_id']
target_cols = [name for name in train.columns if name.startswith('T')]
non_feature_cols = set(target_cols) | set(id_cols)
feature_cols = [name for name in train.columns if name not in non_feature_cols]

# Both splits are indexed with the feature list derived from the train columns
X_train = train[feature_cols]
X_test = test[feature_cols]

# Primary classification target
y_train = train['T2_CLS_ufc_>0']
y_test = test['T2_CLS_ufc_>0']

print(f"Features: {len(feature_cols)}")
print(f"Targets: {target_cols}")

## Basic Overview

In [7]:
# List every column name in the train split as an array
train.columns.values

array(['review_id', 'target_clf', 'target_reg', 'review_stars', 'NB_prob',
       'svm_pred', 'grade_level', 'polarity', 'subjectivity', 'word_cnt',
       'character_cnt', 'num_cnt', 'uppercase_cnt', '#@_cnt',
       'sentence_cnt', 'lexicon_cnt', 'syllable_cnt', 'avg_word_len',
       'token_cnt', 'stopword_cnt', 'stopword_pct', 'ent_cnt', 'ent_pct',
       'pos_adj_pct', 'pos_adj_cnt', 'pos_adp_pct', 'pos_adp_cnt',
       'pos_adv_pct', 'pos_adv_cnt', 'pos_aux_pct', 'pos_aux_cnt',
       'pos_conj_pct', 'pos_conj_cnt', 'pos_det_pct', 'pos_det_cnt',
       'pos_intj_pct', 'pos_intj_cnt', 'pos_noun_pct', 'pos_noun_cnt',
       'pos_num_pct', 'pos_num_cnt', 'pos_part_pct', 'pos_part_cnt',
       'pos_pron_pct', 'pos_pron_cnt', 'pos_propn_pct', 'pos_propn_cnt',
       'pos_punct_pct', 'pos_punct_cnt', 'pos_sconj_pct', 'pos_sconj_cnt',
       'pos_sym_pct', 'pos_sym_cnt', 'pos_verb_pct', 'pos_verb_cnt',
       'pos_x_pct', 'pos_x_cnt', 'dep_root_pct', 'dep_root_cnt',
       'dep_acl_pct'

In [8]:
# Preview the first five rows of the train split
train.head(5)

Unnamed: 0,review_id,target_clf,target_reg,review_stars,NB_prob,svm_pred,grade_level,polarity,subjectivity,word_cnt,character_cnt,num_cnt,uppercase_cnt,#@_cnt,sentence_cnt,lexicon_cnt,syllable_cnt,avg_word_len,token_cnt,stopword_cnt,stopword_pct,ent_cnt,ent_pct,pos_adj_pct,pos_adj_cnt,pos_adp_pct,pos_adp_cnt,pos_adv_pct,pos_adv_cnt,pos_aux_pct,pos_aux_cnt,pos_conj_pct,pos_conj_cnt,pos_det_pct,pos_det_cnt,pos_intj_pct,pos_intj_cnt,pos_noun_pct,pos_noun_cnt,pos_num_pct,pos_num_cnt,pos_part_pct,pos_part_cnt,pos_pron_pct,pos_pron_cnt,pos_propn_pct,pos_propn_cnt,pos_punct_pct,pos_punct_cnt,pos_sconj_pct,pos_sconj_cnt,pos_sym_pct,pos_sym_cnt,pos_verb_pct,pos_verb_cnt,pos_x_pct,pos_x_cnt,dep_root_pct,dep_root_cnt,dep_acl_pct,dep_acl_cnt,dep_acomp_pct,dep_acomp_cnt,dep_advcl_pct,dep_advcl_cnt,dep_advmod_pct,dep_advmod_cnt,dep_agent_pct,dep_agent_cnt,dep_amod_pct,dep_amod_cnt,dep_appos_pct,dep_appos_cnt,dep_attr_pct,dep_attr_cnt,dep_aux_pct,dep_aux_cnt,dep_auxpass_pct,dep_auxpass_cnt,dep_case_pct,dep_case_cnt,dep_cc_pct,dep_cc_cnt,dep_ccomp_pct,dep_ccomp_cnt,dep_compound_pct,dep_compound_cnt,dep_conj_pct,dep_conj_cnt,dep_csubj_pct,dep_csubj_cnt,dep_csubjpass_pct,dep_csubjpass_cnt,dep_dative_pct,dep_dative_cnt,dep_dep_pct,dep_dep_cnt,dep_det_pct,dep_det_cnt,dep_dobj_pct,dep_dobj_cnt,dep_expl_pct,dep_expl_cnt,dep_intj_pct,dep_intj_cnt,dep_mark_pct,dep_mark_cnt,dep_meta_pct,dep_meta_cnt,dep_neg_pct,dep_neg_cnt,dep_nmod_pct,dep_nmod_cnt,dep_npadvmod_pct,dep_npadvmod_cnt,dep_nsubj_pct,dep_nsubj_cnt,dep_nsubjpass_pct,dep_nsubjpass_cnt,dep_nummod_pct,dep_nummod_cnt,dep_oprd_pct,dep_oprd_cnt,dep_parataxis_pct,dep_parataxis_cnt,dep_pcomp_pct,dep_pcomp_cnt,dep_pobj_pct,dep_pobj_cnt,dep_poss_pct,dep_poss_cnt,dep_preconj_pct,dep_preconj_cnt,dep_predet_pct,dep_predet_cnt,dep_prep_pct,dep_prep_cnt,dep_prt_pct,dep_prt_cnt,dep_punct_pct,dep_punct_cnt,dep_quantmod_pct,dep_quantmod_cnt,dep_relcl_pct,dep_relcl_cnt,dep_xcomp_pct,dep_xcomp_cnt,ent_cardinal_pct,ent_cardinal_cnt,ent_date_pct,ent_d
ate_cnt,ent_event_pct,ent_event_cnt,ent_fac_pct,ent_fac_cnt,ent_gpe_pct,ent_gpe_cnt,ent_language_pct,ent_language_cnt,ent_law_pct,ent_law_cnt,ent_loc_pct,ent_loc_cnt,ent_money_pct,ent_money_cnt,ent_norp_pct,ent_norp_cnt,ent_ordinal_pct,ent_ordinal_cnt,ent_org_pct,ent_org_cnt,ent_percent_pct,ent_percent_cnt,ent_person_pct,ent_person_cnt,ent_product_pct,ent_product_cnt,ent_quantity_pct,ent_quantity_cnt,ent_time_pct,ent_time_cnt,ent_work_of_art_pct,ent_work_of_art_cnt
0,LMghnfV8h5_CxooL9NuYCg,False,0,1,0.994,-0.424,8.4,-0.22875,0.4,20,136,0,1,0,2,20,33,5.85,24,7,0.29167,1,0.04167,0.16667,4,0.04167,1,0.16667,4,0.04167,1,0.0,0,0.0,0,0.0,0,0.20833,5,0.08333,2,0.0,0,0.04167,1,0.0,0,0.16667,4,0.04167,1,0.0,0,0.04167,1,0.0,0,0.08333,2,0.0,0,0.04167,1,0.04167,1,0.16667,4,0.0,0,0.125,3,0.08333,2,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.04167,1,0.0,0,0.0,0,0.0,0,0.0,0,0.04167,1,0.0,0,0.0,0,0.0,0,0.0,0,0.08333,2,0.0,0,0.04167,1,0.0,0,0.0,0,0.0,0,0.04167,1,0.0,0,0.0,0,0.0,0,0.04167,1,0.0,0,0.16667,4,0.0,0,0.0,0,0.0,0,0.04167,1,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
1,3ppa5clQkm9Ha5knQHSiSw,True,12,4,0.0,0.602,6.9,0.50325,0.66942,215,1229,0,1,0,14,215,298,4.68982,243,108,0.44444,9,0.03704,0.09053,22,0.09465,23,0.06996,17,0.05761,14,0.0,0,0.107,26,0.0,0,0.18107,44,0.0,0,0.00823,2,0.0535,13,0.08642,21,0.08642,21,0.01235,3,0.00412,1,0.09053,22,0.0,0,0.06173,15,0.00412,1,0.02058,5,0.01235,3,0.06584,16,0.0,0,0.05761,14,0.00412,1,0.02058,5,0.02469,6,0.01235,3,0.0,0,0.0535,13,0.01235,3,0.07819,19,0.06584,16,0.0,0,0.0,0,0.0,0,0.0,0,0.09053,22,0.02881,7,0.0,0,0.0,0,0.00823,2,0.0,0,0.00412,1,0.0,0,0.0,0,0.06996,17,0.01235,3,0.0,0,0.0,0,0.0,0,0.00412,1,0.07407,18,0.00412,1,0.0,0,0.0,0,0.0823,20,0.01235,3,0.09053,22,0.0,0,0.01646,4,0.00412,1,0.0,0,0.0,0,0.0,0,0.0,0,0.00412,1,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.02058,5,0.0,0,0.00412,1,0.0,0,0.0,0,0.00412,1,0.00412,1
2,9KArDVboj7XP3MIfohbxbQ,True,2,5,0.0,1.004,147.60001,0.17134,0.54361,386,2076,1,8,0,1,379,501,4.40682,482,213,0.44191,31,0.06432,0.06432,31,0.08921,43,0.04357,21,0.04149,20,0.0,0,0.11618,56,0.00415,2,0.13278,64,0.01245,6,0.0166,8,0.05809,28,0.09336,45,0.1556,75,0.0083,4,0.00207,1,0.08921,43,0.0,0,0.04357,21,0.0,0,0.01037,5,0.01452,7,0.04564,22,0.00622,3,0.04149,20,0.00622,3,0.0166,8,0.02697,13,0.01037,5,0.00207,1,0.03942,19,0.00622,3,0.05187,25,0.03942,19,0.0,0,0.0,0,0.00415,2,0.00622,3,0.08091,39,0.0332,16,0.0,0,0.00207,1,0.00207,1,0.0,0,0.00415,2,0.0,0,0.01452,7,0.07054,34,0.00622,3,0.0083,4,0.0,0,0.0,0,0.00415,2,0.08299,40,0.02282,11,0.0,0,0.0,0,0.07884,38,0.00415,2,0.1556,75,0.00415,2,0.0083,4,0.01037,5,0.0083,4,0.0083,4,0.0,0,0.0,0,0.00207,1,0.0,0,0.0,0,0.00207,1,0.00207,1,0.01037,5,0.00207,1,0.01245,6,0.0,0,0.01245,6,0.00207,1,0.0,0,0.00207,1,0.0,0
3,8SJpMLd9xfLPDS6aGtLOXw,False,0,5,0.0,-0.801,3.3,0.52292,0.5875,46,244,0,0,0,5,46,58,4.32609,51,25,0.4902,2,0.03922,0.07843,4,0.05882,3,0.01961,1,0.09804,5,0.0,0,0.07843,4,0.0,0,0.19608,10,0.0,0,0.03922,2,0.09804,5,0.03922,2,0.09804,5,0.0,0,0.0,0,0.11765,6,0.0,0,0.09804,5,0.0,0,0.01961,1,0.01961,1,0.01961,1,0.0,0,0.01961,1,0.0,0,0.01961,1,0.03922,2,0.01961,1,0.0,0,0.07843,4,0.0,0,0.05882,3,0.05882,3,0.0,0,0.0,0,0.0,0,0.0,0,0.07843,4,0.07843,4,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.03922,2,0.0,0,0.11765,6,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.03922,2,0.0,0,0.0,0,0.0,0,0.05882,3,0.0,0,0.09804,5,0.0,0,0.01961,1,0.01961,1,0.0,0,0.01961,1,0.0,0,0.0,0,0.01961,1,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
4,uEaiFHX-rIMJvK8Tgq1OwA,True,3,4,1.0,1.004,11.8,0.23823,0.56273,338,1818,1,16,0,11,339,448,4.33235,389,211,0.54242,18,0.04627,0.05913,23,0.0874,34,0.07198,28,0.07198,28,0.0,0,0.0874,34,0.0,0,0.13625,53,0.01542,6,0.02828,11,0.11825,46,0.03856,15,0.07455,29,0.02571,10,0.00771,3,0.12596,49,0.0,0,0.05398,21,0.0,0,0.01799,7,0.02314,9,0.07712,30,0.0,0,0.02828,11,0.0,0,0.00514,2,0.04884,19,0.00514,2,0.00257,1,0.03856,15,0.03085,12,0.01028,4,0.02571,10,0.0,0,0.0,0,0.00514,2,0.0,0,0.05141,20,0.08997,35,0.0,0,0.0,0,0.02057,8,0.0,0,0.01028,4,0.00771,3,0.0,0,0.12082,47,0.00771,3,0.00771,3,0.0,0,0.0,0,0.01028,4,0.06941,27,0.02057,8,0.0,0,0.0,0,0.0874,34,0.00514,2,0.07712,30,0.0,0,0.02571,10,0.00257,1,0.00514,2,0.00514,2,0.0,0,0.0,0,0.00771,3,0.0,0,0.0,0,0.0,0,0.00514,2,0.0,0,0.0,0,0.01799,7,0.0,0,0.00257,1,0.0,0,0.0,0,0.0,0,0.00257,1


In [9]:
# Per-column dtype listing; verbose=True requests the full column-by-column summary
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 183 columns):
 #    Column               Dtype  
---   ------               -----  
 0    review_id            object 
 1    target_clf           bool   
 2    target_reg           int16  
 3    review_stars         int16  
 4    NB_prob              float32
 5    svm_pred             float32
 6    grade_level          float32
 7    polarity             float32
 8    subjectivity         float32
 9    word_cnt             int16  
 10   character_cnt        int16  
 11   num_cnt              int16  
 12   uppercase_cnt        int16  
 13   #@_cnt               int16  
 14   sentence_cnt         int16  
 15   lexicon_cnt          int16  
 16   syllable_cnt         int16  
 17   avg_word_len         float32
 18   token_cnt            int16  
 19   stopword_cnt         int16  
 20   stopword_pct         float32
 21   ent_cnt              int16  
 22   ent_pct              float32
 23   pos_adj_

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
train.describe()

Unnamed: 0,target_reg,review_stars,NB_prob,svm_pred,grade_level,polarity,subjectivity,word_cnt,character_cnt,num_cnt,uppercase_cnt,#@_cnt,sentence_cnt,lexicon_cnt,syllable_cnt,avg_word_len,token_cnt,stopword_cnt,stopword_pct,ent_cnt,ent_pct,pos_adj_pct,pos_adj_cnt,pos_adp_pct,pos_adp_cnt,pos_adv_pct,pos_adv_cnt,pos_aux_pct,pos_aux_cnt,pos_conj_pct,pos_conj_cnt,pos_det_pct,pos_det_cnt,pos_intj_pct,pos_intj_cnt,pos_noun_pct,pos_noun_cnt,pos_num_pct,pos_num_cnt,pos_part_pct,pos_part_cnt,pos_pron_pct,pos_pron_cnt,pos_propn_pct,pos_propn_cnt,pos_punct_pct,pos_punct_cnt,pos_sconj_pct,pos_sconj_cnt,pos_sym_pct,pos_sym_cnt,pos_verb_pct,pos_verb_cnt,pos_x_pct,pos_x_cnt,dep_root_pct,dep_root_cnt,dep_acl_pct,dep_acl_cnt,dep_acomp_pct,dep_acomp_cnt,dep_advcl_pct,dep_advcl_cnt,dep_advmod_pct,dep_advmod_cnt,dep_agent_pct,dep_agent_cnt,dep_amod_pct,dep_amod_cnt,dep_appos_pct,dep_appos_cnt,dep_attr_pct,dep_attr_cnt,dep_aux_pct,dep_aux_cnt,dep_auxpass_pct,dep_auxpass_cnt,dep_case_pct,dep_case_cnt,dep_cc_pct,dep_cc_cnt,dep_ccomp_pct,dep_ccomp_cnt,dep_compound_pct,dep_compound_cnt,dep_conj_pct,dep_conj_cnt,dep_csubj_pct,dep_csubj_cnt,dep_csubjpass_pct,dep_csubjpass_cnt,dep_dative_pct,dep_dative_cnt,dep_dep_pct,dep_dep_cnt,dep_det_pct,dep_det_cnt,dep_dobj_pct,dep_dobj_cnt,dep_expl_pct,dep_expl_cnt,dep_intj_pct,dep_intj_cnt,dep_mark_pct,dep_mark_cnt,dep_meta_pct,dep_meta_cnt,dep_neg_pct,dep_neg_cnt,dep_nmod_pct,dep_nmod_cnt,dep_npadvmod_pct,dep_npadvmod_cnt,dep_nsubj_pct,dep_nsubj_cnt,dep_nsubjpass_pct,dep_nsubjpass_cnt,dep_nummod_pct,dep_nummod_cnt,dep_oprd_pct,dep_oprd_cnt,dep_parataxis_pct,dep_parataxis_cnt,dep_pcomp_pct,dep_pcomp_cnt,dep_pobj_pct,dep_pobj_cnt,dep_poss_pct,dep_poss_cnt,dep_preconj_pct,dep_preconj_cnt,dep_predet_pct,dep_predet_cnt,dep_prep_pct,dep_prep_cnt,dep_prt_pct,dep_prt_cnt,dep_punct_pct,dep_punct_cnt,dep_quantmod_pct,dep_quantmod_cnt,dep_relcl_pct,dep_relcl_cnt,dep_xcomp_pct,dep_xcomp_cnt,ent_cardinal_pct,ent_cardinal_cnt,ent_date_pct,ent_date_cnt,ent_event_pct
,ent_event_cnt,ent_fac_pct,ent_fac_cnt,ent_gpe_pct,ent_gpe_cnt,ent_language_pct,ent_language_cnt,ent_law_pct,ent_law_cnt,ent_loc_pct,ent_loc_cnt,ent_money_pct,ent_money_cnt,ent_norp_pct,ent_norp_cnt,ent_ordinal_pct,ent_ordinal_cnt,ent_org_pct,ent_org_cnt,ent_percent_pct,ent_percent_cnt,ent_person_pct,ent_person_cnt,ent_product_pct,ent_product_cnt,ent_quantity_pct,ent_quantity_cnt,ent_time_pct,ent_time_cnt,ent_work_of_art_pct,ent_work_of_art_cnt
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,2.16936,3.73725,0.37967,-0.01092,13.29147,0.23611,0.55976,113.05316,608.13488,0.48582,3.77081,0.01316,5.36395,111.62863,149.13946,4.48989,132.47707,65.36788,0.47746,4.43946,0.03372,0.08764,10.08547,0.06871,9.70422,0.06714,8.71505,0.06816,8.87172,0.0,0.0,0.10273,14.01826,0.00216,0.31624,0.16565,21.19539,0.01006,1.48073,0.02501,3.58808,0.07375,10.58333,0.03548,4.43624,0.11542,14.72319,0.01444,2.2197,0.00239,0.36184,0.10022,14.02543,0.00051,0.07531,0.08057,9.30349,0.0024,0.35688,0.02514,2.90394,0.0138,2.08381,0.0654,8.48987,0.00055,0.08294,0.05078,5.95004,0.004,0.52218,0.01183,1.58587,0.03656,5.17799,0.00462,0.71793,0.0017,0.24203,0.04166,5.30313,0.01388,2.12016,0.0331,4.16359,0.04264,5.37521,0.00038,0.05598,1e-05,0.00141,0.00174,0.25485,0.00163,0.24052,0.0755,10.18334,0.04437,6.05233,0.00163,0.25501,0.00134,0.20937,0.01072,1.71102,4e-05,0.00765,0.01154,1.60127,0.0032,0.44084,0.00737,0.96613,0.09388,12.58931,0.00357,0.56217,0.00671,0.98313,0.00085,0.12011,0.00063,0.11143,0.00406,0.60191,0.05903,8.2813,0.01696,2.34851,0.00032,0.04728,0.00106,0.13801,0.06353,8.92526,0.00598,0.87768,0.11607,14.81623,0.00132,0.19858,0.00941,1.32053,0.00964,1.43015,0.00514,0.74752,0.00415,0.56322,6e-05,0.00946,0.00051,0.07078,0.00356,0.42099,6e-05,0.00768,3e-05,0.00397,0.00046,0.05966,0.00139,0.21533,0.00219,0.24865,0.00141,0.19783,0.00471,0.60598,0.00016,0.02346,0.00631,0.7597,0.00027,0.03792,0.00035,0.04952,0.00269,0.37616,0.00028,0.04163
std,6.35054,1.45375,0.45833,1.45595,20.09798,0.22831,0.1332,105.00255,562.66786,1.07196,5.18972,0.1762,4.81956,103.84695,138.28045,1.47024,123.28994,63.28882,0.08213,5.05216,0.02534,0.04119,8.4878,0.02982,10.32288,0.03321,8.4911,0.02807,8.30071,0.0,0.0,0.03272,13.76611,0.00706,1.33846,0.04525,19.557,0.01393,2.36694,0.01904,4.29118,0.03552,11.72827,0.04018,6.28359,0.0404,14.48667,0.01372,2.98231,0.00705,0.99905,0.03812,14.94405,0.00369,0.38928,0.03039,7.71269,0.00569,0.75378,0.0208,2.76825,0.0132,2.75056,0.03271,8.28852,0.0027,0.31891,0.03439,5.72156,0.00894,1.05085,0.01265,1.95355,0.02252,5.95829,0.00785,1.29741,0.00527,0.68208,0.02094,5.0426,0.01405,3.00106,0.03027,5.18904,0.02491,5.28484,0.00221,0.24897,0.00035,0.03752,0.00481,0.61868,0.00676,1.04557,0.02952,10.0536,0.02351,6.43419,0.00452,0.63828,0.00451,0.61152,0.01167,2.52292,0.00083,0.11658,0.01308,2.11975,0.00822,1.02044,0.01065,1.37853,0.02958,12.11116,0.00685,1.08651,0.01056,1.63969,0.00343,0.38339,0.00254,0.38149,0.00739,1.07291,0.02694,8.78219,0.01642,3.02858,0.00198,0.23001,0.00397,0.40547,0.02825,9.44189,0.00906,1.41147,0.04003,14.53836,0.0048,0.60303,0.01104,1.76047,0.01132,2.03996,0.00951,1.39084,0.00824,1.11447,0.00092,0.10579,0.00275,0.31479,0.00881,0.90946,0.00103,0.10928,0.00079,0.06553,0.00278,0.28727,0.00479,0.70264,0.00704,0.71345,0.00456,0.5481,0.00941,1.16717,0.00161,0.17897,0.01173,1.38834,0.00196,0.22186,0.0023,0.25375,0.00675,0.88535,0.00195,0.23353
min,0.0,1.0,0.0,-29.943,-10.6,-1.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00796,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,3.0,0.0,-0.888,4.9,0.10231,0.48154,45.0,245.0,0.0,1.0,0.0,2.0,44.0,60.0,4.21667,53.0,24.0,0.4375,1.0,0.01698,0.06061,5.0,0.0503,3.0,0.04615,3.0,0.05128,3.0,0.0,0.0,0.08456,5.0,0.0,0.0,0.13725,9.0,0.0,0.0,0.01176,1.0,0.05085,3.0,0.01053,1.0,0.08929,6.0,0.0,0.0,0.0,0.0,0.07692,5.0,0.0,0.0,0.06061,4.0,0.0,0.0,0.01109,1.0,0.0,0.0,0.04478,3.0,0.0,0.0,0.02941,2.0,0.0,0.0,0.0,0.0,0.0219,1.0,0.0,0.0,0.0,0.0,0.02888,2.0,0.0,0.0,0.01429,1.0,0.02703,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05769,4.0,0.02985,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07692,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04255,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04622,3.0,0.0,0.0,0.09009,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,4.0,0.018,-0.446,7.2,0.23891,0.55779,81.0,437.0,0.0,2.0,0.0,4.0,80.0,107.0,4.42029,95.0,46.0,0.48837,3.0,0.03061,0.08125,8.0,0.06915,7.0,0.06452,6.0,0.06726,7.0,0.0,0.0,0.10417,10.0,0.0,0.0,0.16154,15.0,0.00542,1.0,0.02381,2.0,0.07375,7.0,0.02717,3.0,0.11111,10.0,0.01333,1.0,0.0,0.0,0.1,9.0,0.0,0.0,0.075,7.0,0.0,0.0,0.02183,2.0,0.01258,1.0,0.0625,6.0,0.0,0.0,0.04451,4.0,0.0,0.0,0.0098,1.0,0.03571,3.0,0.0,0.0,0.0,0.0,0.04032,4.0,0.01205,1.0,0.02817,3.0,0.04,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07563,7.0,0.04348,4.0,0.0,0.0,0.0,0.0,0.00893,1.0,0.0,0.0,0.00917,1.0,0.0,0.0,0.00298,1.0,0.09489,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05914,6.0,0.01462,1.0,0.0,0.0,0.0,0.0,0.06383,6.0,0.0,0.0,0.11111,10.0,0.0,0.0,0.00714,1.0,0.0073,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,5.0,0.999,0.425,11.6,0.37416,0.6381,144.0,772.0,1.0,5.0,0.0,7.0,142.0,189.0,4.66038,168.0,84.0,0.53125,6.0,0.04651,0.10714,13.0,0.08696,13.0,0.08475,11.0,0.08411,12.0,0.0,0.0,0.12264,18.0,0.0,0.0,0.18947,27.0,0.01587,2.0,0.03587,5.0,0.09615,14.0,0.04905,6.0,0.13542,18.0,0.02273,3.0,0.0,0.0,0.12418,18.0,0.0,0.0,0.09402,12.0,0.00153,1.0,0.03509,4.0,0.02174,3.0,0.08333,11.0,0.0,0.0,0.06452,8.0,0.0051,1.0,0.01835,2.0,0.05,7.0,0.00758,1.0,0.0,0.0,0.05294,7.0,0.0219,3.0,0.04545,5.0,0.05556,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09333,13.0,0.05791,8.0,0.0,0.0,0.0,0.0,0.01766,2.0,0.0,0.0,0.01802,2.0,0.00256,1.0,0.01163,1.0,0.1118,16.0,0.00549,1.0,0.01068,1.0,0.0,0.0,0.0,0.0,0.00642,1.0,0.07534,11.0,0.02564,3.0,0.0,0.0,0.0,0.0,0.08081,12.0,0.0099,1.0,0.13636,18.0,0.0,0.0,0.01515,2.0,0.01566,2.0,0.00772,1.0,0.00592,1.0,0.0,0.0,0.0,0.0,0.00294,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00662,1.0,0.0,0.0,0.00909,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,354.0,5.0,1.0,29.789,386.20001,1.0,1.0,1063.0,5000.0,36.0,159.0,21.0,63.0,1015.0,1296.0,210.0,1184.0,674.0,0.78261,88.0,0.5,0.5,103.0,0.27273,114.0,0.33333,100.0,0.25,101.0,0.0,0.0,0.33333,147.0,0.8939,337.0,0.66667,220.0,0.5,48.0,0.23077,64.0,0.2963,180.0,1.0,305.0,1.0,206.0,0.17647,45.0,0.5,58.0,0.5,183.0,0.5,17.0,1.0,110.0,0.14286,17.0,0.25,32.0,0.14286,42.0,0.33333,100.0,0.09091,7.0,0.5,83.0,0.22222,36.0,0.18182,26.0,0.2,83.0,0.11111,30.0,0.11111,16.0,0.23077,61.0,0.25,52.0,0.88859,335.0,0.33333,88.0,0.14286,5.0,0.04,1.0,0.08889,12.0,0.5,101.0,0.5,113.0,0.5,87.0,0.08333,15.0,0.11111,15.0,0.14286,44.0,0.07407,5.0,0.23077,33.0,0.5,50.0,0.25,34.0,0.28571,153.0,0.15385,22.0,0.25,37.0,0.09524,9.0,0.08,7.0,0.16667,18.0,0.27273,104.0,0.23077,50.0,0.05263,6.0,0.125,7.0,0.27273,113.0,0.13636,23.0,0.875,205.0,0.29545,17.0,0.125,21.0,0.125,27.0,0.25,31.0,0.33333,36.0,0.05882,4.0,0.09091,7.0,0.5,25.0,0.1,7.0,0.05556,3.0,0.2,9.0,0.125,20.0,0.25,17.0,0.1,14.0,0.5,25.0,0.09091,5.0,0.25,38.0,0.07317,8.0,0.08824,6.0,0.13333,18.0,0.09091,7.0


In [11]:
# Count missing values in each column
train.isnull().sum()

review_id              0
target_clf             0
target_reg             0
review_stars           0
NB_prob                0
svm_pred               0
grade_level            0
polarity               0
subjectivity           0
word_cnt               0
character_cnt          0
num_cnt                0
uppercase_cnt          0
#@_cnt                 0
sentence_cnt           0
lexicon_cnt            0
syllable_cnt           0
avg_word_len           0
token_cnt              0
stopword_cnt           0
stopword_pct           0
ent_cnt                0
ent_pct                0
pos_adj_pct            0
pos_adj_cnt            0
pos_adp_pct            0
pos_adp_cnt            0
pos_adv_pct            0
pos_adv_cnt            0
pos_aux_pct            0
pos_aux_cnt            0
pos_conj_pct           0
pos_conj_cnt           0
pos_det_pct            0
pos_det_cnt            0
pos_intj_pct           0
pos_intj_cnt           0
pos_noun_pct           0
pos_noun_cnt           0
pos_num_pct            0


In [12]:
# Count rows that exactly duplicate an earlier row across all columns
train.duplicated().sum()

0

## Univariate Analysis and Comparison to Target

In [None]:
def describe_numeric(df, column_name):
    """Draw a boxplot (left panel) and a KDE-overlaid histogram (right panel) of one column.

    Args:
        df: DataFrame holding the column to plot.
        column_name: Name of the column to plot. Underscores are replaced by
            spaces and the result is title-cased to form the figure title.

    Returns:
        None. The figure is created via pyplot; it is neither shown nor
        closed inside this function.
    """
    fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Plain (non-scientific) tick labels on both axes of both panels
    for panel in (box_ax, hist_ax):
        panel.ticklabel_format(style='plain', axis='both')

    values = df[column_name]
    sns.histplot(values, kde=True, ax=hist_ax)
    sns.boxplot(y=values, color='lightblue', orient='v', ax=box_ax)

    pretty_title = column_name.replace('_', ' ').title()
    fig.suptitle(pretty_title, fontsize=16)
    plt.tight_layout()
    
def compare_feature_target(df, feature_name, target_col='T2_CLS_ufc_>0'):
    """Plot one feature split by target: hue-coloured histogram (left) and boxplot (right).

    Args:
        df: DataFrame containing both the feature and the target column.
        feature_name: Column plotted on the histogram's x-axis and the
            boxplot's y-axis; also used in the figure title.
        target_col: Column used as the histogram hue and the boxplot x-axis
            grouping. Defaults to 'T2_CLS_ufc_>0'.

    Returns:
        None. The figure is created via pyplot; it is neither shown nor
        closed inside this function.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, box_ax = axes

    # Plain (non-scientific) tick labels on both axes of both panels
    for panel in axes:
        panel.ticklabel_format(style='plain', axis='both')

    feature = df[feature_name]
    target = df[target_col]
    sns.histplot(x=feature, hue=target, kde=True, ax=hist_ax)
    sns.boxplot(x=target, y=feature, ax=box_ax)

    fig.suptitle(f'{feature_name} by Target', fontsize=16)
    plt.tight_layout()

In [14]:
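# Disabled: dataprep correlation report against the regression target.
# NOTE(review): "target_reg" is the column name shown in the saved outputs above;
# the loading cell expects T-prefixed targets — confirm the name before re-enabling.
# plot_correlation(train, "target_reg", k=10)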

In [15]:
# Disabled: dataprep overview report for the whole train frame; uncomment to run.
# plot(train)