### Importing Libraries

In [1]:
import re, numpy as np, pandas as pd

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import stopwords

import matplotlib.pyplot as plt
%matplotlib inline

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nathennavon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Loading Data

In [2]:
fashion = pd.read_pickle('../data/fashion_df.pkl')

### EDA & Cleaning

In [3]:
fashion.shape

(113881, 4)

In [4]:
fashion.columns

Index(['created', 'handle', 'text', 'tweet_id'], dtype='object')

In [5]:
fashion.text[0]

'For Spring/Summer 2018, the #AcneStudios sneakers range is being brought to life by Tage Johansson – the 12 year old son of Jonny Johansson, Acne Studios Creative Director. Discover more: https://t.co/bWYrEt8heu https://t.co/1ieBXmo8V9'

In [6]:
fashion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113881 entries, 0 to 113880
Data columns (total 4 columns):
created     113881 non-null object
handle      113868 non-null object
text        113868 non-null object
tweet_id    113855 non-null float64
dtypes: float64(1), object(3)
memory usage: 3.5+ MB


### Fixing Nulls

In [7]:
error_collect = fashion[~fashion.created.str.match('\w{3} \w{3}')].index

In [8]:
for error in error_collect:
    fashion['tweet_id'][error-1] = fashion.created[error]
    print(fashion.iloc[error-1,:]) #checking for an error 
    
fashion = fashion.drop(labels=error_collect)
fashion.to_pickle('../data/fashion_clean.pkl')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


created                        Fri May 06 00:16:29 +0000 2016
handle                                                Bulgari
text        Live from Paris artist @stuartsemple painted o...
tweet_id                                          7.28378e+17
Name: 10513, dtype: object
created                     Sat May 13 13:49:39 +0000 2017
handle                                                DKNY
text        .@Metal_Magazine photographs the #DKNY Spring 
tweet_id                                       8.63391e+17
Name: 33519, dtype: object
created                        Sun Dec 10 17:45:53 +0000 2017
handle                                        Dolce & Gabbana
text        Quite the duo of festive reds! Classic Cream L...
tweet_id                                          9.39914e+17
Name: 34864, dtype: object
created                        Tue Jan 02 20:34:32 +0000 2018
handle                                                  gucci
text        Inside the November issue of @Allure_magazine ...
t

In [9]:
#making sure there
fashion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113868 entries, 0 to 113880
Data columns (total 4 columns):
created     113868 non-null object
handle      113868 non-null object
text        113868 non-null object
tweet_id    113868 non-null float64
dtypes: float64(1), object(3)
memory usage: 4.3+ MB


### Removing non alphanumeric characters 

In [10]:
def tweet_cleaner(text):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())

fashion['clean'] = fashion.text.apply(tweet_cleaner)

# checking to see if #'s, @handle, and links are removed by regex tweet_cleaner

fashion[['text', 'clean']].head(8)

Unnamed: 0,text,clean
0,"For Spring/Summer 2018, the #AcneStudios sneak...",For Spring Summer 2018 the AcneStudios sneaker...
1,"RT @i_D: according to acne studios, this young...",RT D according to acne studios this young fash...
2,The guests at the #AcneStudios West Hollywood ...,The guests at the AcneStudios West Hollywood s...
3,#AcneStudiosBlåKonst Land gum are loose fittin...,AcneStudiosBl Konst Land gum are loose fitting...
4,"Introducing Ingridh, as worn by Juliette Lewis...",Introducing Ingridh as worn by Juliette Lewis ...
5,RT @HYPEBEAST: Acne Studios second store in L....,RT Acne Studios second store in L A
6,"Now open – #AcneStudios West Hollywood, a stor...",Now open AcneStudios West Hollywood a store th...
7,"#AcneStudios Rellah is a loose, boxy fit, shor...",AcneStudios Rellah is a loose boxy fit short s...


### Formating df

In [11]:
#switching date of tweet dtype from object to date-time
fashion['created'] = pd.to_datetime(fashion.created)

#setting handle as index
fashion.set_index('handle', inplace=True)

fashion.dtypes

created     datetime64[ns]
text                object
tweet_id           float64
clean               object
dtype: object

In [12]:
fashion.groupby(fashion.index).size().sort_values(ascending=False)

handle
Roberto Cavalli        3247
Dolce & Gabbana        3245
Vera Wang              3244
Valentino              3240
Stella McCartney       3238
Missoni                3238
Oscar de la Renta      3234
Michael Kors           3234
VERSACE                3232
HUGO BOSS              3231
gucci                  3231
Marc Jacobs            3226
CALVIN KLEIN           3225
Fendi                  3224
AlbertaFerretti        3221
COMME des GARÇONS      3219
ETRO                   3217
Burberry               3216
Giorgio Armani         3212
Alexander McQueen      3209
Louis Vuitton          3209
Tommy Hilfiger         3205
Christian Louboutin    3204
Balmain                3204
Off-White™             3175
Derek Lam              3014
Bulgari                2853
Chloé                  2813
LANVIN                 2704
Dior                   2567
PRADA                  2005
KARL LAGERFELD         1978
CHANEL                 1819
Herve Leger            1409
DKNY                   1278
MIU MIU      

In [13]:
fashion.shape

(113868, 4)

In [14]:
fashion.to_pickle('../data/fashion_clean.pkl')