In [None]:
import nltk
from nltk.probability import FreqDist
import nltk.corpus  
from nltk.text import Text
from nltk.util import bigrams

### Exploring Twitter Descriptions of Land Trust Accounts

In [None]:
# Exploring the descriptions of the land trust Twitter accounts
# NOTE(review): absolute, machine-specific path — this cell only runs where that directory exists
file_location = "/Users/hannahleonard/Documents/UM/2019_Fall/ADA/AA_Project/Data/"
file_name = "20191130_landtrusts_userobj.txt"

# One flat list holding every whitespace-separated word from every description
descs = []
# Explicit encoding so the read does not depend on the platform default
# (assumes the export is UTF-8 — TODO confirm)
with open(file_location + file_name, 'r', encoding='utf-8') as ifile :
    # skip the first line (presumably the header row); the default keeps an
    # empty file from raising StopIteration
    next(ifile, None)
    # iterate the handle directly instead of loading every line with readlines()
    for line in ifile :
        # strip() also removes trailing tabs, so rows whose last columns are
        # empty produce fewer fields
        fields = line.strip().split("\t")

        # spot 6 has the description
        if len(fields) >= 7 : # sometimes there is not a description
            descs.extend(fields[6].split())

In [None]:
# Print the first two lines of the file to check the column layout
with open(file_location + file_name, 'r') as preview :
    for line_number in range(2) :
        print(preview.readline())

In [None]:
# Number of whitespace-separated words collected from all descriptions
# (descs is a flat word list, not one entry per description)
len(descs)

In [None]:
# 10 most common words in the descriptions, counted on the raw tokens
# (no lower-casing or stopword removal yet)
fd = FreqDist(descs)
fd.most_common(10)

In [None]:
# Stopword sets (English and Spanish) used by clean_list to filter the tokens
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded first — confirm
# NOTE(review): the Spanish list likely contains tokens that are also English words
# (e.g. "sea"), which would then be dropped from English text too — verify
stopwords = set(nltk.corpus.stopwords.words("english"))
stopwords_sp = set(nltk.corpus.stopwords.words("spanish"))

def clean_list(text) :
    ''' Build a cleaned copy of a list of tokens.

        * tokens that are not purely alphabetic are discarded
          (this includes words with attached punctuation or digits)
        * the remaining tokens are cast to lowercase
        * tokens found in the English or Spanish stopword sets are removed

        Returns a new list; the input is left untouched.
    '''
    kept = []
    for token in text :
        # the alphabetic test is applied to the token as it was written
        if not token.isalpha() :
            continue
        lowered = token.lower()
        if lowered in stopwords or lowered in stopwords_sp :
            continue
        kept.append(lowered)
    return kept

In [None]:
# Creates the clean list: lower-cased, alphabetic-only description words
# with English and Spanish stopwords removed
descs_clean_LT = clean_list(descs)

In [None]:
# 20 most common words in the cleaned descriptions
fd_LT = FreqDist(descs_clean_LT)
fd_LT.most_common(20)

### Concordance

In [None]:
# Wrap the cleaned description words in an NLTK Text for the exploration below.
# The words come from one flat, stopword-free list, so the context shown around
# 'land' is the cleaned token stream and can run across neighbouring descriptions.
lt_desc = Text(descs_clean_LT)
lt_desc.concordance('land')

In [None]:
# Occurrences of 'conservation' with their neighbouring cleaned description words
lt_desc.concordance('conservation')

### Similar Words

In [None]:
# Words that occur in contexts similar to 'land' in the cleaned descriptions (NLTK Text.similar)
lt_desc.similar('land')

In [None]:
# Words that occur in contexts similar to 'conservation' in the cleaned descriptions
lt_desc.similar('conservation')

### Bigrams

In [None]:
# nltk workaround for the Text.collocations() issue: print collocation_list() ourselves.
# Depending on the NLTK version, collocation_list() returns "w1 w2" strings or
# (w1, w2) tuples, so every entry is normalised to a string before joining.
print('; '.join(
    pair if isinstance(pair, str) else ' '.join(pair)
    for pair in lt_desc.collocation_list()
))

### Distinct Words

In [None]:
# all distinct cleaned words used in descriptions, sorted alphabetically
# NOTE(review): this displays the entire vocabulary, which can be a long output
sorted(set(lt_desc))

In [None]:
# number of unique cleaned words in descriptions
len(set(lt_desc))

In [None]:
# Share of distinct words among all cleaned description words (unique / total)
# NOTE(review): raises ZeroDivisionError if lt_desc is empty
len(set(lt_desc))/len(lt_desc)

### Location

In [None]:
# Exploring the locations of the land trust Twitter accounts
# NOTE(review): absolute, machine-specific path — this cell only runs where that directory exists
file_location = "/Users/hannahleonard/Documents/UM/2019_Fall/ADA/AA_Project/Data/"
file_name = "20191130_landtrusts_userobj.txt"

# One flat list holding every whitespace-separated word from every location field
geo = []
# Explicit encoding so the read does not depend on the platform default
# (assumes the export is UTF-8 — TODO confirm)
with open(file_location + file_name, 'r', encoding='utf-8') as ifile :
    # skip the first line (presumably the header row); the default keeps an
    # empty file from raising StopIteration
    next(ifile, None)
    # iterate the handle directly instead of loading every line with readlines()
    for line in ifile :
        # strip() also removes trailing tabs, so rows whose last columns are
        # empty produce fewer fields
        fields = line.strip().split("\t")

        # spot 3 has the location
        if len(fields) >= 4 : # sometimes there is not a location
            # splitting on whitespace breaks multi-word place names into
            # separate tokens (each word is counted on its own)
            geo.extend(fields[3].split())

In [None]:
# location variant of clean_list, used to capture states in the location spot
# NOTE: this redefinition replaces the stopword-filtering clean_list defined
# earlier in the notebook until that one is defined again
def clean_list(text) :
    ''' Build a cleaned copy of a list of location tokens.

        * tokens that are not purely alphabetic are discarded
          (this includes words with attached punctuation or digits)
        * the remaining tokens are cast to uppercase to catch states

        Returns a new list; the input is left untouched.
    '''
    upper_tokens = []
    for token in text :
        if token.isalpha() :
            upper_tokens.append(token.upper())
    return upper_tokens

In [None]:
# Creates the clean list: upper-cased, alphabetic-only location words
# (uses the location version of clean_list, so no stopwords are removed)
geo_clean = clean_list(geo)

In [None]:
# top 10 most common location words of organizations
fd = FreqDist(geo_clean)
fd.most_common(10)

# TODO (question for John): when showing more (10, 20, etc.) the locations include
# things like "NEW" and "SAN", which are not states. Is this more of a user input
# issue or a cleaning issue?
# NOTE(review): the location field is split on whitespace when it is loaded, so
# multi-word place names are counted one word at a time — that points to cleaning.

---

### Exploring Tweets of Land Trust Accounts

In [None]:
# Exploring the tweets of the land trust Twitter accounts
# NOTE(review): absolute, machine-specific path — this cell only runs where that directory exists
file_location = "/Users/hannahleonard/Documents/UM/2019_Fall/ADA/AA_Project/Data/"
file_name = "all_clean_tweets.txt"

# One flat list holding every whitespace-separated word from every tweet
tweets = []
# Explicit encoding so the read does not depend on the platform default
# (assumes the export is UTF-8 — TODO confirm)
with open(file_location + file_name, 'r', encoding='utf-8') as ifile :
    # skip the first line (presumably the header row); the default keeps an
    # empty file from raising StopIteration
    next(ifile, None)
    # iterate the handle directly instead of loading every line with readlines()
    for line in ifile :
        # strip() also removes trailing tabs, so rows whose last columns are
        # empty produce fewer fields
        fields = line.strip().split("\t")

        # spot 3 has the tweet
        if len(fields) >= 4 : # in case there isn't a tweet
            tweets.extend(fields[3].split())

In [None]:
# Print the first two lines of the tweet file to check the column layout
with open(file_location + file_name, 'r') as preview :
    for line_number in range(2) :
        print(preview.readline())

In [None]:
# Number of whitespace-separated words collected from all tweets
# (tweets is a flat word list, not one entry per tweet)
len(tweets)

In [None]:
# 10 most common words in the tweets, counted on the raw tokens
# (no lower-casing or stopword removal yet)
fd = FreqDist(tweets)
fd.most_common(10)

In [None]:
# Stopword sets (English and Spanish) used by clean_list to filter the tweet tokens
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded first — confirm
# NOTE(review): the Spanish list likely contains tokens that are also English words
# (e.g. "sea"), which would then be dropped from English text too — verify
stopwords = set(nltk.corpus.stopwords.words("english"))
stopwords_sp = set(nltk.corpus.stopwords.words("spanish"))

def clean_list(text) :
    ''' Build a cleaned copy of a list of tokens.

        * tokens that are not purely alphabetic are discarded
          (this includes words with attached punctuation or digits)
        * the remaining tokens are cast to lowercase
        * tokens found in the English or Spanish stopword sets are removed

        Returns a new list; the input is left untouched.
    '''
    kept = []
    for token in text :
        # the alphabetic test is applied to the token as it was written
        if not token.isalpha() :
            continue
        lowered = token.lower()
        if lowered in stopwords or lowered in stopwords_sp :
            continue
        kept.append(lowered)
    return kept

In [None]:
# Creates the clean list: lower-cased, alphabetic-only tweet words
# with English and Spanish stopwords removed
clean_tweets = clean_list(tweets)

In [None]:
# 20 most common words in the cleaned tweets
fd_CT = FreqDist(clean_tweets)
fd_CT.most_common(20)

### Concordance

In [None]:
# Wrap the cleaned tweet words in an NLTK Text for the exploration below.
# The words come from one flat, stopword-free list, so the context shown around
# 'money' is the cleaned token stream and can run across neighbouring tweets.
lt_tweet = Text(clean_tweets)
lt_tweet.concordance('money')

In [None]:
# Occurrences of 'conservation' with their neighbouring cleaned tweet words
lt_tweet.concordance('conservation')

### Common Contexts

In [None]:
# Contexts shared by the listed words in the cleaned tweets (NLTK Text.common_contexts)
lt_tweet.common_contexts(['conservation','legacy','money'])

### Similar Words

In [None]:
# Words that occur in contexts similar to 'land' in the cleaned tweets (NLTK Text.similar)
lt_tweet.similar('land')

In [None]:
# Words that occur in contexts similar to 'legacy' in the cleaned tweets
lt_tweet.similar('legacy')

### Bigrams

In [None]:
# nltk workaround for the Text.collocations() issue: print collocation_list() ourselves.
# Depending on the NLTK version, collocation_list() returns "w1 w2" strings or
# (w1, w2) tuples, so every entry is normalised to a string before joining.
print('; '.join(
    pair if isinstance(pair, str) else ' '.join(pair)
    for pair in lt_tweet.collocation_list()
))

### Distinct Words

In [None]:
# all distinct cleaned words used in tweets, sorted alphabetically
# NOTE(review): this displays the entire vocabulary, which can be a long output
sorted(set(lt_tweet))

In [None]:
# number of unique cleaned words in tweets
len(set(lt_tweet))

In [None]:
# Share of distinct words among all cleaned tweet words (unique / total)
# NOTE(review): raises ZeroDivisionError if lt_tweet is empty
len(set(lt_tweet))/len(lt_tweet)