In [13]:
import pandas as pd
import seaborn as sns

In [22]:
df = pd.read_csv('../data/Tweets.csv')
df = df.drop(columns=['tweet_id'])

In [80]:
def describe(df: pd.DataFrame, num_columns: list[str], cat_columns: list[str], other_columns: list[str]):
    for column in num_columns:
        values = df[[column]].values
        print(f"Column [{column}]")
        print(f"\tmin:\t{values.min()}\n\tmean:\t{values.mean()}\n\tmax:\t{values.max()}")
        missing_count = df[[column]].isna().values.sum()
        print(f"missing data {missing_count}({missing_count/len(df)}%)")
        print('\n')
    
    for column in cat_columns:
        print(f"Column [{column}]")
        print("Value Counts\n", df[[column]].value_counts())
        missing_count = df[[column]].isna().values.sum()
        print(f"missing data {missing_count}({missing_count/len(df)}%)")
        print('\n')

    for column in other_columns:
        print(f"Column [{column}]")
        missing_count = df[[column]].isna().values.sum()
        print(f"missing data {missing_count}({missing_count/len(df)}%)")
        print('\n')

In [81]:
df.columns

Index(['airline_sentiment', 'airline_sentiment_confidence', 'negativereason',
       'negativereason_confidence', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')

In [82]:
num_columns = ['airline_sentiment_confidence', 'negativereason_confidence', 'retweet_count']
cat_columns = ['negativereason', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'tweet_location']
other_columns = ['text', 'tweet_coord', 'tweet_created', 'user_timezone']

In [83]:
describe(df, num_columns, cat_columns, other_columns)

Column [airline_sentiment_confidence]
	min:	0.335
	mean:	0.9001688524590163
	max:	1.0
missing data 0(0.0%)


Column [negativereason_confidence]
	min:	nan
	mean:	nan
	max:	nan
missing data 4118(0.2812841530054645%)


Column [retweet_count]
	min:	0
	mean:	0.08265027322404371
	max:	44
missing data 0(0.0%)


Column [negativereason]
Value Counts
 negativereason             
Customer Service Issue         2910
Late Flight                    1665
Can't Tell                     1190
Cancelled Flight                847
Lost Luggage                    724
Bad Flight                      580
Flight Booking Problems         529
Flight Attendant Complaints     481
longlines                       178
Damaged Luggage                  74
dtype: int64
missing data 5462(0.37308743169398906%)


Column [airline]
Value Counts
 airline       
United            3822
US Airways        2913
American          2759
Southwest         2420
Delta             2222
Virgin America     504
dtype: int64
missing data 0(0

### Things to do
#### General Notes
- `airline_sentiment` and possibly `airline_sentiment_confidence` are target columns (the latter cannot be in traning data)
- Remove instance of `"@airline"` tags from text 

####  How to handle each column
**Numerical Columns**
- `negativereason_confidence` -- fill missing data with 0
- `retweet_count` -- remove, almost 100% is just 0

**Categorical Columns**
- `negativereason` -- one hot encode top K reasons +1 column for "other"
- `airline` -- remove or one hot encode with "other" column
- `airline_sentiment_gold` -- remove, almost 100% missing data
- `name` -- remove, unique data
- `negative_reason_gold` -- remove, almost 100% missing data
- `tweet_location` -- remove or one hot encode with "other" column

**Other Columns**
- `tweet_coord` -- remove, almost 100% missing data
- `user_timezone` -- remove or one hot encode with "other" column
- `tweet_created` -- convert to columns: day of year (sin/cos), day of week, time of day (sin/cos)
- `text` -- ???
