<a href="https://colab.research.google.com/github/kamilamyslinska/Natural-Language-Processing-NLP/blob/main/Text_Preprocessing_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Text Preprocessing 

###Feature Extraction


In [24]:
import requests 
import re
import pandas as pd
from matplotlib import pyplot as plt
from random import randint
import numpy as np
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

####File loading
The data comes from Instagram and concerns the Lodz-based clothing company Hexeline. These are data scraped from the company's public profile.


In [2]:
df = pd.read_csv('hashtaghexeline.csv')
df.sample(3)

Unnamed: 0,postUrl,profileUrl,username,fullName,commentCount,likeCount,pubDate,description,location,imgUrl,postId,ownerId,type,query,timestamp,isSidecar,sidecarMedias,videoUrl,viewCount
531,https://www.instagram.com/p/B0tdceUBOD7/,https://www.instagram.com/hexelineofficial,hexelineofficial,HEXELINE,5,49,2019-08-03T17:31:51.000Z,"Uwaga, czarujemy!🧚‍♀️⁠⠀\n⁠⠀\nZamknijcie oczy, ...",,https://scontent-lhr8-2.cdninstagram.com/v/t51...,2.102466e+18,1371708000.0,Photo,,2022-09-16T14:35:50.213Z,False,,,
260,https://www.instagram.com/p/CB6ASMZn-sQ/,https://www.instagram.com/hexelineofficial,hexelineofficial,HEXELINE,1,32,2020-06-26T17:15:19.000Z,"Zródłem stylu jest Twoje wnętrze, które najlep...",,https://scontent-lhr8-1.cdninstagram.com/v/t51...,2.340184e+18,1371708000.0,Photo,,2022-09-16T14:35:13.032Z,False,,,
530,https://www.instagram.com/p/B0wCSYTBgF9/,https://www.instagram.com/hexelineofficial,hexelineofficial,HEXELINE,3,31,2019-08-04T17:34:42.000Z,"Macie takie słowo, które działa na Was jak zak...",,,2.103191e+18,1371708000.0,Video,,2022-09-16T14:35:50.213Z,False,,https://scontent-lhr8-1.cdninstagram.com/v/t50...,289.0


In [3]:
df = df[df['description'].notnull()]

####Word counts


In [4]:
df['word_counts'] = df['description'].apply(lambda x: len(str(x).split()))

In [5]:
df['word_counts'].max()

255

In [6]:
df['word_counts'].min()

1

In [7]:
df['word_counts'].sample(3)

445    20
792    16
433    15
Name: word_counts, dtype: int64

####Characters count


In [8]:
def char_counts(x):
    s = x.split()
    x = ''.join(s)
    return len(x)

In [9]:
df['char_counts'] = df['description'].apply(lambda x: char_counts(str(x)))

In [10]:
df['char_counts'].sample(3)

273     96
695    161
127    435
Name: char_counts, dtype: int64

####Average characters per word


In [11]:
df['avg_word_len'] = df['char_counts']/df['word_counts']

In [12]:
df['avg_word_len'].sample(3)

777    9.044444
545    8.480000
43     9.313725
Name: avg_word_len, dtype: float64

####Count #HashTags and @Mentions


In [13]:
df['hashtags_count'] = df['description'].apply(lambda x: len([t for t in x.split() if t.startswith('#')]))

In [14]:
df['mentions_count'] = df['description'].apply(lambda x: len([t for t in x.split() if t.startswith('@')]))

In [15]:
df['hashtags_count'].sample(3)

837    11
407     0
766     0
Name: hashtags_count, dtype: int64

In [16]:
df['mentions_count'].sample(3)

737    0
627    1
679    1
Name: mentions_count, dtype: int64

####Upper case word counts

In [17]:
df['upper_counts'] = df['description'].apply(lambda x: len([t for t in x.split() if t.isupper()]))

In [18]:
df['upper_counts'].sample(3)

701    0
715    1
363    0
Name: upper_counts, dtype: int64

####If numeric digits are present in twitts

In [19]:
df['numerics_count'] = df['description'].apply(lambda x: len([t for t in x.split() if t.isdigit()]))

In [20]:
df['numerics_count'].sample(3)

372    0
165    0
654    0
Name: numerics_count, dtype: int64

##Preprocessing and Cleaning

####Lower Case Conversion

In [21]:
df['description'] = df['description'].apply(lambda x: str(x).lower())

In [22]:
df['description'].sample(3)

849    new hexeline ss2016 campaign\nphoto: @marcinke...
769    sylwestrowe odliczanie: czarny kombinezon "hex...
79     tenis, prążki paski, najkorzystniej w pionie 👌...
Name: description, dtype: object

####Count and Remove Emails

In [25]:
df['emails'] = df['description'].apply(lambda x: re.findall(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+\b)', x))

In [26]:
df['emails_count'] = df['emails'].apply(lambda x: len(x))

In [27]:
df[df['emails_count']>0]

Unnamed: 0,postUrl,profileUrl,username,fullName,commentCount,likeCount,pubDate,description,location,imgUrl,...,viewCount,word_counts,char_counts,avg_word_len,hashtags_count,mentions_count,upper_counts,numerics_count,emails,emails_count


In [28]:
df['twitts'] = df['description'].apply(lambda x: re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)',"", x))

In [29]:
df[df['emails_count']>0]

Unnamed: 0,postUrl,profileUrl,username,fullName,commentCount,likeCount,pubDate,description,location,imgUrl,...,word_counts,char_counts,avg_word_len,hashtags_count,mentions_count,upper_counts,numerics_count,emails,emails_count,twitts


####Count URLs and Remove it

In [30]:
df['url_flags'] = df['description'].apply(lambda x: len(re.findall(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', x)))

In [31]:
df[df['url_flags']>0].sample(3)

Unnamed: 0,postUrl,profileUrl,username,fullName,commentCount,likeCount,pubDate,description,location,imgUrl,...,char_counts,avg_word_len,hashtags_count,mentions_count,upper_counts,numerics_count,emails,emails_count,twitts,url_flags
198,https://www.instagram.com/p/CFVhcGLnYRq/,https://www.instagram.com/freakmeaning,freakmeaning,fre4k,0,100,2020-09-19T23:18:41.000Z,i'm trying out his program i discovered called...,,https://scontent-lhr8-2.cdninstagram.com/v/t51...,...,472,9.076923,30,0,1,0,[],0,i'm trying out his program i discovered called...,2
733,https://www.instagram.com/p/BSf44IHAOBz/,https://www.instagram.com/fashion_coco_club,fashion_coco_club,COCO SALON ODZIEZOWY,1,21,2017-04-05T09:14:48.000Z,nowa dostawa świątecznych sukienek i bluzek z ...,,https://scontent-lhr8-1.cdninstagram.com/v/t51...,...,385,9.871795,13,0,0,0,[],0,nowa dostawa świątecznych sukienek i bluzek z ...,2
734,https://www.instagram.com/p/BSf4ofAgJw8/,https://www.instagram.com/fashion_coco_club,fashion_coco_club,COCO SALON ODZIEZOWY,2,19,2017-04-05T09:12:40.000Z,nowa dostawa świątecznych sukienek i bluzek z ...,,https://scontent-lhr8-2.cdninstagram.com/v/t51...,...,311,9.147059,11,0,0,0,[],0,nowa dostawa świątecznych sukienek i bluzek z ...,1


In [32]:
df['description'] = df['description'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , x))

In [33]:
df['description'].sample(3)

348    pewność siebie pozwala współczesnym czarownico...
920    day 2 / hexeline lookbook in @studiopzo #fashi...
252    lekki i niezwykle delikatny jedwab podany na e...
Name: description, dtype: object

###Special Chars removal or punctuation removal

In [34]:
df['description'] = df['description'].apply(lambda x: re.sub(r'[^\w ]+', "", x))

In [35]:
df['description'].sample(3)

770    przed nami sylwester a potem czas karnawału le...
814    na poziomie 0 w naszym chr galaxy znajdziecie ...
902                      hexeline fashion lookbook white
Name: description, dtype: object

####Remove multiple spaces

In [36]:
df['description'] = df['description'].apply(lambda x: ' '.join(x.split()))

In [37]:
df['description'].sample(3)

187    kuszące desenie są nieodłączną częścią kobiece...
112    ponadczasowa klasyka moc detali nowoczesność i...
214                                 autumnvibes hexeline
Name: description, dtype: object

####Remove Accented Chars

In [38]:
import unicodedata

In [39]:
def remove_accented_chars(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return x

In [41]:
df['description'] = df['description'].apply(lambda x: remove_accented_chars(x))

In [42]:
df['description'].sample(3)

118    dzis obchodzimy magiczne swieto swiata porzadk...
159    doskonay paszcz w genialnym kolorze hexeline 1...
613    drogie dziewczyny z okazji mikoajek na haso an...
Name: description, dtype: object