In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

  import pandas.util.testing as tm


### Reading input files for both train and test data and merging them together to work on the combined data further

In [0]:
# Load the train and test splits and stack them into one frame so that the
# analysis below runs on the combined data.
df_train = pd.read_csv('/content/drive/My Drive/capstone_train.csv')
df_test = pd.read_csv('/content/drive/My Drive/capstone_test.csv')

# ignore_index=True gives the stacked frame a clean 0..n-1 index immediately,
# so no duplicate row labels exist at any point and no later reset is needed.
df_final = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Keep only the columns that contain no missing values in the combined frame.
df_final = df_final.dropna(axis='columns')

# Normalise the 'Myconid:' label (stray trailing colon) to 'Myconid'.
# A single .loc assignment is used instead of the chained
# `df.col[mask] = value` form, which triggers SettingWithCopyWarning and is
# not guaranteed to write through to the frame (it never does under pandas
# Copy-on-Write).
df_final.loc[df_final['Plant_Type'] == 'Myconid:', 'Plant_Type'] = 'Myconid'

In order to segment the types of plants based on their average distance from the water source and their sunlight consumption, we treat the following parameters as averages:

1. Average Distance is the average of the distance from the water source and the standing distance from the water source.

2. Average sunlight is the average of the shadow index in the morning and in the evening (the amount of sunlight is taken to be proportional to the shadow index in the morning and evening).


In [10]:
# Work on a deep copy so that df_final itself is left untouched.
dd = df_final.copy(deep=True)

# Vectorised column arithmetic: each row's average is computed directly on the
# aligned columns. This avoids a Python-level loop over every row and does not
# depend on dd having a default 0..n-1 index (building a fresh pd.Series from
# a list would silently misalign or produce NaN otherwise).
dd['Average_sunlight'] = (dd['Shadow_In_Evening'] + dd['Shadow_In_Morning']) / 2
dd['Average_Distance'] = (
    dd['Distance_To_Water_Source'] + dd['Standing_Distance_To_Water_Source']
) / 2

# Keep only the two derived features and the species label.
dd = dd.loc[:, ['Average_Distance', 'Average_sunlight', 'Plant_Type']]
dd

Unnamed: 0,Average_Distance,Average_sunlight,Plant_Type
0,148.0,177.0,Assassin vine
1,501.0,178.0,Assassin vine
2,384.5,186.0,Ascomoid
3,95.0,191.0,Assassin vine
4,335.5,186.5,Assassin vine
...,...,...,...
581007,179.5,183.5,Ascomoid
581008,138.5,182.0,Assassin vine
581009,171.5,173.0,Ascomoid
581010,124.5,165.5,Assassin vine


## Creating two DataFrames with conditions favorable and unfavorable for the survival for plant species.

1. Favorable is defined as the condition for a plant if it receives more than the average value of sunlight throughout the day and its distance from the water sources is less than the average distance (i.e. it is closer to water).

2. Unfavorable is defined as the condition for a plant if it receives less than the average value of sunlight throughout the day and its distance from the water sources is more than the average distance (i.e. it is farther from water).

In [0]:
# Dataset-wide means are the thresholds that split plants into the two groups.
mean_distance = np.mean(dd['Average_Distance'])
mean_sunlight = np.mean(dd['Average_sunlight'])

# Row masks relative to each threshold. The comparisons are strict, so rows
# exactly equal to a mean fall into neither group.
distance_below_mean = dd['Average_Distance'] < mean_distance
distance_above_mean = dd['Average_Distance'] > mean_distance
sunlight_above_mean = dd['Average_sunlight'] > mean_sunlight
sunlight_below_mean = dd['Average_sunlight'] < mean_sunlight

# Favorable: below-mean distance AND above-mean sunlight.
favorable_df = dd[distance_below_mean & sunlight_above_mean]
# Unfavorable: above-mean distance AND below-mean sunlight.
unfavorable_df = dd[distance_above_mean & sunlight_below_mean]

# Share of all plants (in percent) that fall into each group.
total_plants = len(dd)
survival_ratio = 100 * (len(favorable_df) / total_plants)
threat_ratio = 100 * (len(unfavorable_df) / total_plants)

## Now we have the plants split into those with favorable and those with unfavorable conditions

Now let us extract the information of the key species which is present in favorable/unfavorable environment

In [20]:
# Count the plants of each species that are in unfavorable conditions.
# value_counts() tallies every species in a single pass over the column
# (most common species first) instead of re-filtering the column once per
# species; to_dict() yields a plain {species: count} mapping.
unfavorable_species = unfavorable_df['Plant_Type'].value_counts().to_dict()
unfavorable_species

{'Ascomoid': 38467,
 'Assassin vine': 45941,
 'Basidirond': 7083,
 'Dark tree': 432,
 'Hangman tree': 1611,
 'Kelpie': 2557,
 'Myconid': 4783}

From the above dictionary, the top 3 plant species under unfavorable conditions are 'Ascomoid', 'Assassin vine' and 'Basidirond'.

In [19]:
# Count the plants of each species that are in favorable conditions and store
# the counts in a dictionary. value_counts() tallies every species in a single
# pass over the column (most common species first) instead of re-filtering the
# column once per species; to_dict() yields a plain {species: count} mapping.
favorable_species = favorable_df['Plant_Type'].value_counts().to_dict()
favorable_species

{'Ascomoid': 79868,
 'Assassin vine': 97795,
 'Basidirond': 9533,
 'Dark tree': 904,
 'Hangman tree': 3303,
 'Kelpie': 4934,
 'Myconid': 5041}

This tells us that the species 'Dark tree' has the fewest plants in favorable conditions.

# Conclusion

## There are two inferences that are obtained from the above analysis
### 1. The species 'Ascomoid', 'Assassin vine' and 'Basidirond' need more attention and plants with affinity towards favorable conditions
### 2. The plant species named 'Dark tree' has the fewest plants in favorable conditions, and we might want to increase that number

### Note: For the second inference, it is possible that 'Dark tree' is a species whose favorable conditions are the opposite of those of the other species (as the name 'dark' suggests); in that case we could change the criteria for favorable conditions for this species