In [119]:
import pandas as pd

**CREATING INDIVIDUAL DATAFRAMES**

In [120]:
# Read each raw review corpus from its CSV file (paths are relative to the
# notebook's working directory).
amazon_df = pd.read_csv(filepath_or_buffer='amazon_reviews.csv')
twitter_df = pd.read_csv(filepath_or_buffer='Tweets.csv')
yelp_df = pd.read_csv(filepath_or_buffer='yelp.csv')

In [121]:
# Show the column names of every raw frame under a banner naming its source.
column_report = (
    ("--------------------------AMAZON---------------------", amazon_df),
    ("\n--------------------------TWITTER---------------------", twitter_df),
    ("\n--------------------------YELP---------------------", yelp_df),
)
for banner, frame in column_report:
    print(banner)
    print(frame.columns)

--------------------------AMAZON---------------------
Index(['Unnamed: 0', 'reviewerName', 'overall', 'reviewText', 'reviewTime',
       'day_diff', 'helpful_yes', 'helpful_no', 'total_vote',
       'score_pos_neg_diff', 'score_average_rating', 'wilson_lower_bound'],
      dtype='object')

--------------------------TWITTER---------------------
Index(['textID', 'text', 'selected_text', 'sentiment'], dtype='object')

--------------------------YELP---------------------
Index(['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id',
       'cool', 'useful', 'funny'],
      dtype='object')


**LOADING DATA BY SELECTING ONLY THE REQUIRED COLUMNS FROM EACH DATAFRAME**

In [122]:
def data_loading(df, features, label):
    """Build a two-column ``review`` / ``analysis`` frame from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    features : str or list of str
        Name of the column to expose as ``review``.
    label : str or list of str
        Name of the column to expose as ``analysis``.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with columns ``['review', 'analysis']``.
        Rows with a missing value in either column are removed; the
        surviving rows keep their original index labels.

    Raises
    ------
    ValueError
        If ``features`` and ``label`` together do not name exactly two columns.
    KeyError
        If a requested column is not present in ``df`` (raised by pandas).
    """
    # Accept a bare column name as well as a list/tuple of names; adding two
    # plain strings would otherwise silently build one bogus column name.
    feature_cols = [features] if isinstance(features, str) else list(features)
    label_cols = [label] if isinstance(label, str) else list(label)
    selected = feature_cols + label_cols      # list concatenation

    # The result is always renamed to exactly two columns, so check up front
    # instead of failing later with a length-mismatch error.
    if len(selected) != 2:
        raise ValueError(
            "data_loading expects exactly one feature column and one label "
            "column, got {}".format(selected)
        )

    # Drop rows with a NaN in either column, then take an explicit copy so
    # later assignments on the result never touch ``df``.
    df_reduced = df[selected].dropna(axis=0, how='any').copy()
    df_reduced.columns = ['review', 'analysis']
    return df_reduced

EXTRACTING REQUIRED COLUMNS FOR AMAZON REVIEWS

In [123]:
# Amazon: the review body is in `reviewText`, the star rating in `overall`.
features, label = ['reviewText'], ['overall']
amazon_df_reduced = data_loading(df=amazon_df, features=features, label=label)
amazon_df_reduced.head(5)

Unnamed: 0,review,analysis
0,No issues.,4.0
1,"Purchased this for my device, it worked as adv...",5.0
2,it works as expected. I should have sprung for...,4.0
3,This think has worked out great.Had a diff. br...,5.0
4,"Bought it with Retail Packaging, arrived legit...",5.0


EXTRACTING REQUIRED COLUMNS FOR TWITTER REVIEWS

In [124]:
# Twitter: the tweet is in `text`, its sentiment tag in `sentiment`.
features, label = ['text'], ['sentiment']
twitter_df_reduced = data_loading(df=twitter_df, features=features, label=label)
twitter_df_reduced.head(5)

Unnamed: 0,review,analysis
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


EXTRACTING REQUIRED COLUMNS FOR YELP REVIEWS

In [125]:
# Yelp: the review body is in `text`, the star rating in `stars`.
features, label = ['text'], ['stars']
yelp_df_reduced = data_loading(df=yelp_df, features=features, label=label)

**DEALING WITH LABELS**

In [126]:
# List the distinct label values of each source before harmonising them.
for reduced in (amazon_df_reduced, twitter_df_reduced, yelp_df_reduced):
    print(reduced['analysis'].unique())

[4. 5. 3. 1. 2.]
['neutral' 'negative' 'positive']
[5 4 2 3 1]


MAPPING LABELS OF AMAZON REVIEWS TO ['NEGATIVE','NEUTRAL','POSITIVE']

In [127]:
# Collapse the numeric star ratings into three sentiment classes in a single
# vectorised pass: rating < 3 -> negative, 3 <= rating < 4 -> neutral,
# rating >= 4 -> positive (the same cut-offs as truncating with int()).
# A fresh column is built instead of writing strings into the numeric column
# through .loc, which pandas deprecates as an incompatible-dtype assignment.
amazon_stars = pd.to_numeric(amazon_df_reduced['analysis'], errors='coerce')
amazon_sentiment = pd.cut(
    amazon_stars,
    bins=[float('-inf'), 3, 4, float('inf')],
    right=False,                                  # left-closed bins: [3, 4)
    labels=['negative', 'neutral', 'positive'],
).astype(object)
# Values that are not numeric (for example labels produced by an earlier run
# of this cell) are kept as they are, so the cell can be re-run safely.
amazon_df_reduced = amazon_df_reduced.assign(
    analysis=amazon_sentiment.where(amazon_stars.notna(), amazon_df_reduced['analysis'])
)


MAPPING LABELS OF TWITTER REVIEWS TO ['NEGATIVE','NEUTRAL','POSITIVE']

In [128]:
# The Twitter labels already use the target vocabulary, so no remapping is
# needed. Normalise case and surrounding whitespace, then fail loudly if a
# label outside the three expected classes shows up, rather than letting it
# slip into the combined dataset unnoticed.
twitter_labels = twitter_df_reduced['analysis'].astype(str).str.strip().str.lower()
unexpected_labels = set(twitter_labels.unique()) - {'negative', 'neutral', 'positive'}
if unexpected_labels:
    raise ValueError(
        "Unexpected Twitter sentiment labels: {}".format(sorted(unexpected_labels))
    )
twitter_df_reduced = twitter_df_reduced.assign(analysis=twitter_labels)


MAPPING LABELS OF YELP REVIEWS TO ['NEGATIVE','NEUTRAL','POSITIVE']

In [129]:
# Collapse the numeric star ratings into three sentiment classes in a single
# vectorised pass: rating < 3 -> negative, 3 <= rating < 4 -> neutral,
# rating >= 4 -> positive (the same cut-offs as truncating with int()).
# A fresh column is built instead of writing strings into the numeric column
# through .loc, which pandas deprecates as an incompatible-dtype assignment.
yelp_stars = pd.to_numeric(yelp_df_reduced['analysis'], errors='coerce')
yelp_sentiment = pd.cut(
    yelp_stars,
    bins=[float('-inf'), 3, 4, float('inf')],
    right=False,                                  # left-closed bins: [3, 4)
    labels=['negative', 'neutral', 'positive'],
).astype(object)
# Values that are not numeric (for example labels produced by an earlier run
# of this cell) are kept as they are, so the cell can be re-run safely.
yelp_df_reduced = yelp_df_reduced.assign(
    analysis=yelp_sentiment.where(yelp_stars.notna(), yelp_df_reduced['analysis'])
)

**AGGREGATING DATAFRAMES**

In [130]:
# Stack the three harmonised frames row-wise. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it every source keeps its own
# row labels and the result carries duplicate index values. The `names`
# argument was dropped: it only labels the levels of a hierarchical index
# built with `keys`, which is not used here.
df = pd.concat(
    [amazon_df_reduced, twitter_df_reduced, yelp_df_reduced],
    axis=0,
    ignore_index=True,
)

# Sanity checks: three classes, no rows lost or duplicated, no missing values.
print("Number of classes :", df['analysis'].nunique(dropna=False))
print()
print(len(df) == len(amazon_df_reduced) + len(twitter_df_reduced) + len(yelp_df_reduced))
print()
print(df.isna().sum())

Number of classes : 3

True

review      0
analysis    0
dtype: int64


In [131]:
# Keep the review text (X) and the sentiment label (y) as single-column
# DataFrames rather than Series, then confirm they have the same row count.
X, y = (df[column].to_frame() for column in ('review', 'analysis'))
len(X) == len(y)

True