In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pickle

In [2]:
# Seed NumPy's global RNG so the random steps further down are repeatable
# when the notebook is run top to bottom.
RANDOM_SEED = 828
np.random.seed(RANDOM_SEED)

In [3]:
# Load the labelled tweet sample; the 'Sentiment' and 'id' columns are used below.
labels_csv_path = './tweets_sampled_test_0602.csv'
label_df = pd.read_csv(labels_csv_path)

In [4]:
# Number of rows with no sentiment label at all.
label_df['Sentiment'].isnull().sum()

77

In [5]:
# Frequency of each raw label value (missing entries are not counted).
raw_labels = label_df['Sentiment']
raw_labels.value_counts()

0       236
1       112
-1       43
?        31
?, 1      1
Name: Sentiment, dtype: int64

In [6]:
# Keep only rows whose label is exactly -1, 0 or 1; rows with a missing label
# or any other value (e.g. '?') are dropped.
# The comparison is done on the string form so the cell stays re-runnable:
# after 'Sentiment' has been cast to int below, a string-only isin() matches
# nothing and a second execution would silently empty label_df.
valid_label_mask = label_df['Sentiment'].astype(str).isin(['0', '1', '-1'])
label_df = label_df[valid_label_mask].copy()

# Remove every literal 's' from the ids. regex=False pins literal matching,
# because the default of `regex` differs between pandas versions.
# NOTE(review): presumably 's' only ever appears as a prefix of the id -- confirm.
label_df['id'] = label_df['id'].str.replace('s', '', regex=False)

# Labels are numeric from here on.
label_df['Sentiment'] = label_df['Sentiment'].astype(int)

In [7]:
# Hold out exactly 100 labelled tweets for testing; the rest is training data.
# random_state is passed explicitly because, with the default (None),
# train_test_split draws from NumPy's global RNG, so re-running this cell
# without first re-running the seed cell gave a different partition each time.
# 828 is the same value the notebook passes to np.random.seed.
# NOTE(review): the split is not stratified by label; consider
# stratify=label_df['Sentiment'] if class balance across splits matters.
train_df, test_df = train_test_split(label_df, test_size=100, random_state=828)

In [8]:
# (rows, columns) of the train and test label frames, in that order.
tuple(split.shape for split in (train_df, test_df))

((291, 4), (100, 4))

In [9]:
# Label distribution in the training split.
train_labels = train_df['Sentiment']
train_labels.value_counts()

 0    173
 1     87
-1     31
Name: Sentiment, dtype: int64

In [10]:
# Label distribution in the test split.
test_labels = test_df['Sentiment']
test_labels.value_counts()

 0    63
 1    25
-1    12
Name: Sentiment, dtype: int64

In [11]:
def _sentiment_by_id(split_df):
    """Return a dict mapping each value of 'id' to that row's 'Sentiment'."""
    return split_df.set_index('id')['Sentiment'].to_dict()


# One lookup table per split.
train_id_to_sentiment = _sentiment_by_id(train_df)
test_id_to_sentiment = _sentiment_by_id(test_df)

In [12]:
# Bundle both lookup tables under their split name.
id_to_sentiment_map = dict(
    train=train_id_to_sentiment,
    test=test_id_to_sentiment,
)

In [13]:
# Persist the train/test id -> sentiment mapping to disk.
id_map_path = 'id_to_sentiment_0602.pkl'
with open(id_map_path, mode='wb') as out_file:
    pickle.dump(id_to_sentiment_map, out_file)

In [14]:
# Read the id -> sentiment mapping back from the pickle file.
id_map_path = 'id_to_sentiment_0602.pkl'
with open(id_map_path, mode='rb') as in_file:
    id_to_sentiment_map = pickle.load(in_file)

In [15]:
# Pull the per-split lookup tables back out of the loaded mapping.
train_id_to_sentiment = id_to_sentiment_map['train']
test_id_to_sentiment = id_to_sentiment_map['test']

In [16]:
# Load the pickled tweet frame; its 'id' and 'text' columns are used below.
tweets_pickle_path = './tweets_sampled_0602.pkl'
tweet_df = pd.read_pickle(tweets_pickle_path)

In [17]:
# Select the tweets whose id has a label in each split; .copy() gives
# independent frames so columns can be added to them afterwards.
in_train = tweet_df['id'].isin(train_id_to_sentiment.keys())
in_test = tweet_df['id'].isin(test_id_to_sentiment.keys())
train_tweet_df = tweet_df.loc[in_train].copy()
test_tweet_df = tweet_df.loc[in_test].copy()

In [18]:
# (rows, columns) of the train and test tweet frames, in that order.
tuple(split.shape for split in (train_tweet_df, test_tweet_df))

((291, 17), (100, 17))

In [19]:
# Add a 'sentiment' column to each split by looking up every tweet id in the
# lookup table that belongs to the same split.
for split_df, id_to_label in (
    (train_tweet_df, train_id_to_sentiment),
    (test_tweet_df, test_id_to_sentiment),
):
    split_df['sentiment'] = split_df['id'].map(id_to_label)

In [26]:
# Pull the tweet text and its label out of each split as arrays; both come
# from the same frame, so position i of the corpus matches position i of the labels.
train_corpus, train_sentiment = (
    train_tweet_df[column].values for column in ('text', 'sentiment')
)
test_corpus, test_sentiment = (
    test_tweet_df[column].values for column in ('text', 'sentiment')
)

In [27]:
# Nested mapping: split name -> {'corpus': texts, 'sentiment': labels}.
corpus_and_sentiment = dict(
    train=dict(corpus=train_corpus, sentiment=train_sentiment),
    test=dict(corpus=test_corpus, sentiment=test_sentiment),
)

In [28]:
# Persist the texts and labels of both splits to disk.
corpus_pickle_path = './corpus_and_sentiment.pkl'
with open(corpus_pickle_path, mode='wb') as out_file:
    pickle.dump(corpus_and_sentiment, out_file)